diff --git a/CMakeLists.txt b/CMakeLists.txt
index a28613647b32c44c472917b10cdcab7acab843d1..7a8f5e0a69aac3852cb2752c90d54d8f50b69483 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,12 +16,6 @@ cmake_minimum_required(VERSION 3.0)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
include(lite_utils)
-lite_option(WITH_PADDLE_MOBILE "Use the paddle-mobile legacy build" OFF)
-if (WITH_PADDLE_MOBILE)
- add_subdirectory(mobile)
- return()
-endif(WITH_PADDLE_MOBILE)
-
set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
set(CMAKE_CXX_STANDARD 11)
diff --git a/README.md b/README.md
index 70c53a5775148c6608008d0a86a6966aca29c644..d995bcc327705228098c1b26753213928ad4a79d 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,6 @@ Paddle Lite提供了C++、Java、Python三种API,并且提供了相应API的
- [iOS示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/ios_app_demo.html)
- [ARMLinux示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/linux_arm_demo.html)
- [X86示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html)
-- [CUDA示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/cuda.html)
- [OpenCL示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/opencl.html)
- [FPGA示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/fpga.html)
- [华为NPU示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/huawei_kirin_npu.html)
@@ -77,7 +76,6 @@ Paddle Lite提供了C++、Java、Python三种API,并且提供了相应API的
| CPU(32bit) |  |  |  |  |
| CPU(64bit) |  |  |  |  |
| OpenCL | - | - |  | - |
-| CUDA |  |  | - | - |
| FPGA | - |  | - | - |
| 华为NPU | - | - |  | - |
| 百度 XPU |  |  | - | - |
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 69fba7968d75f0308acdc787313b48c2804d6caf..e980922d5b4869ede65e57e750b5b85676ed0dde 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -199,13 +199,10 @@ if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()
-if (LITE_ON_FLATBUFFERS_DESC_VIEW)
- add_definitions("-DLITE_ON_FLATBUFFERS_DESC_VIEW")
- message(STATUS "Flatbuffers will be used as cpp default program description.")
-endif()
-
if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
+ add_definitions("-DLITE_ON_FLATBUFFERS_DESC_VIEW")
+ message(STATUS "Flatbuffers will be used as cpp default program description.")
else()
add_definitions("-DLITE_WITH_FLATBUFFERS_DESC")
endif()
diff --git a/cmake/device/huawei_ascend_npu.cmake b/cmake/device/huawei_ascend_npu.cmake
index 0bd9591eee702f4db914a8b547c4c99b21d0473b..a2b664abd13591214b9955993854ebccea9a4bf4 100644
--- a/cmake/device/huawei_ascend_npu.cmake
+++ b/cmake/device/huawei_ascend_npu.cmake
@@ -16,6 +16,11 @@ if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
+# The Huawei Ascend NPU DDK is built against the pre-C++11 libstdc++ ABI, so force
+# -D_GLIBCXX_USE_CXX11_ABI=0 when the host compiler is newer than GCC 5.0 (which
+# defaults to the new ABI).
+if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 5.0)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
+endif()
+
# 1. path to Huawei Ascend Install Path
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
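Note: `-D_GLIBCXX_USE_CXX11_ABI=0` only switches libstdc++'s dual ABI for types such as `std::string`; the DDK appears to require the old ABI (hence the flag above), and mixing ABIs breaks linking. A minimal standalone check, not part of this patch and assuming GCC/libstdc++, that prints which ABI a translation unit was compiled with:

```cpp
#include <cstdio>
#include <string>

int main() {
  // _GLIBCXX_USE_CXX11_ABI is defined by the libstdc++ headers:
  // 0 selects the old (pre-C++11) std::string/std::list layout,
  // 1 selects the C++11 layout. Both sides of a link must agree.
  std::printf("_GLIBCXX_USE_CXX11_ABI = %d\n", _GLIBCXX_USE_CXX11_ABI);
  std::string s = "abi probe";  // layout of this object depends on the flag
  return s.empty() ? 1 : 0;
}
```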
diff --git a/cmake/external/flatbuffers.cmake b/cmake/external/flatbuffers.cmake
index 4c2413c620d3531399ceede234eed16e9f4f0b6b..47b3042234cfa482ca7187baf8e51275ea8d3ac8 100644
--- a/cmake/external/flatbuffers.cmake
+++ b/cmake/external/flatbuffers.cmake
@@ -27,7 +27,7 @@ SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers)
SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers)
SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." FORCE)
IF(WIN32)
- set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
+ set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ELSE(WIN32)
set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE)
ENDIF(WIN32)
@@ -64,13 +64,6 @@ ExternalProject_Add(
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
)
-IF(WIN32)
- IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib")
- add_custom_command(TARGET extern_flatbuffers POST_BUILD
- COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib
- )
- ENDIF()
-ENDIF(WIN32)
ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES})
ADD_DEPENDENCIES(flatbuffers extern_flatbuffers)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 76cc7b21deab41a40869a68df3a4dce359177c21..eb6c26e38dcd86aa4e0a536ea0f4541651bed6fa 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -217,6 +217,10 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64")
ENDIF()
+ IF(LITE_WITH_HUAWEI_ASCEND_NPU)
+ SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}")
+ ENDIF()
+
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
ExternalProject_Add(
${TARGET_NAME}
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index d859404d559282970d96a735c400f745481e8efa..af05db559123e6d7305c35f95e3dacd58eeb7e19 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -267,6 +267,10 @@ function(cc_library TARGET_NAME)
list(REMOVE_ITEM cc_library_DEPS warpctc)
add_dependencies(${TARGET_NAME} warpctc)
endif()
+ if("${cc_library_DEPS};" MATCHES "fbs_headers;")
+ list(REMOVE_ITEM cc_library_DEPS fbs_headers)
+ add_dependencies(${TARGET_NAME} fbs_headers)
+ endif()
# Only deps libmklml.so, not link
if("${cc_library_DEPS};" MATCHES "mklml;")
list(REMOVE_ITEM cc_library_DEPS mklml)
diff --git a/docs/api_reference/cv.md b/docs/api_reference/cv.md
index d660bd7e382d80ac7151acacef3fd30caeb902bc..2192f4c7bbd1c020e65f5485c9292716ae12df84 100644
--- a/docs/api_reference/cv.md
+++ b/docs/api_reference/cv.md
@@ -91,14 +91,24 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, ImageFormat dstFormat, T
// 方法二
void ImagePreprocess::imageCovert(const uint8_t* src,
uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat);
+ // 方法三
+ void ImagePreprocess::imageCovert(const uint8_t* src,
+ uint8_t* dst, ImageFormat srcFormat, ImageFormat dstFormat,
+ int srcw, int srch);
```
+ - 第一个 `imageCovert` 接口,缺省参数来源于 `ImagePreprocess` 类的成员变量。故在初始化 `ImagePreprocess` 类的对象时,必须要给以下成员变量赋值:
- param srcFormat:`ImagePreprocess` 类的成员变量`srcFormat_`
- param dstFormat:`ImagePreprocess` 类的成员变量`dstFormat_`
+ - param srcw: `ImagePreprocess` 类的成员变量`transParam_`结构体中的`iw`变量
+ - param srch: `ImagePreprocess` 类的成员变量`transParam_`结构体中的`ih`变量
- - 第二个`imageCovert` 接口,可以直接使用
+ - 第二个`imageCovert` 接口,缺省参数来源于 `ImagePreprocess` 类的成员变量。故在初始化 `ImagePreprocess` 类的对象时,必须要给以下成员变量赋值:
+ - param srcw: `ImagePreprocess` 类的成员变量`transParam_`结构体中的`iw`变量
+ - param srch: `ImagePreprocess` 类的成员变量`transParam_`结构体中的`ih`变量
+ - 第三个`imageCovert` 接口,可以直接使用
+
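A hedged usage sketch of the three `imageCovert` overloads documented above (the header path, namespace, and any `TransParam` fields other than `iw`/`ih` are assumptions not confirmed by this diff):

```cpp
#include "lite/utils/cv/paddle_image_preprocess.h"  // assumed header path

using namespace paddle::lite::utils::cv;  // assumed namespace

void ConvertDemo(const uint8_t* nv12, uint8_t* bgr, int w, int h) {
  TransParam tparam;
  tparam.iw = w;  // becomes the default srcw for methods 1 and 2
  tparam.ih = h;  // becomes the default srch for methods 1 and 2
  ImagePreprocess preprocess(ImageFormat::NV12, ImageFormat::BGR, tparam);

  // Method 1: formats and sizes all come from the members set above.
  preprocess.imageCovert(nv12, bgr);
  // Method 2: formats passed explicitly, sizes still read from transParam_.
  preprocess.imageCovert(nv12, bgr, ImageFormat::NV12, ImageFormat::BGR);
  // Method 3: fully explicit, usable without relying on the members.
  preprocess.imageCovert(nv12, bgr, ImageFormat::NV12, ImageFormat::BGR, w, h);
}
```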
### 缩放 Resize
`Resize` 功能支持颜色空间:GRAY、NV12(NV21)、RGB(BGR)和RGBA(BGRA)
diff --git a/docs/demo_guides/baidu_xpu.md b/docs/demo_guides/baidu_xpu.md
index 242188e0fd1397494db545757e0679c0fd957da1..ae60f9038707218fd204369f4b3ebbbda82f7aca 100644
--- a/docs/demo_guides/baidu_xpu.md
+++ b/docs/demo_guides/baidu_xpu.md
@@ -16,69 +16,12 @@ Paddle Lite已支持百度XPU在x86和arm服务器(例如飞腾 FT-2000+/64)
### 已支持的Paddle模型
-- [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz)
-- [BERT](https://paddlelite-demo.bj.bcebos.com/models/bert_fp32_fluid.tar.gz)
-- [ERNIE](https://paddlelite-demo.bj.bcebos.com/models/ernie_fp32_fluid.tar.gz)
-- YOLOv3
-- Mask R-CNN
-- Faster R-CNN
-- UNet
-- SENet
-- SSD
+- [开源模型支持列表](../introduction/support_model_list)
- 百度内部业务模型(由于涉密,不方便透露具体细节)
### 已支持(或部分支持)的Paddle算子(Kernel接入方式)
-- scale
-- relu
-- tanh
-- sigmoid
-- stack
-- matmul
-- pool2d
-- slice
-- lookup_table
-- elementwise_add
-- elementwise_sub
-- cast
-- batch_norm
-- mul
-- layer_norm
-- softmax
-- conv2d
-- io_copy
-- io_copy_once
-- __xpu__fc
-- __xpu__multi_encoder
-- __xpu__resnet50
-- __xpu__embedding_with_eltwise_add
-
-### 已支持(或部分支持)的Paddle算子(子图/XTCL接入方式)
-
-- relu
-- tanh
-- conv2d
-- depthwise_conv2d
-- elementwise_add
-- pool2d
-- softmax
-- mul
-- batch_norm
-- stack
-- gather
-- scale
-- lookup_table
-- slice
-- transpose
-- transpose2
-- reshape
-- reshape2
-- layer_norm
-- gelu
-- dropout
-- matmul
-- cast
-- yolo_box
+- [算子支持列表](../introduction/support_operation_list)
## 参考示例演示
@@ -233,7 +176,7 @@ $ ./lite/tools/build.sh --arm_os=armlinux --arm_abi=armv8 --arm_lang=gcc --build
```
- 将编译生成的build.lite.x86/inference_lite_lib/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/amd64/include目录;
-- 将编译生成的build.lite.x86/inference_lite_lib/cxx/include/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so文件;
+- 将编译生成的build.lite.x86/inference_lite_lib/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/amd64/lib/libpaddle_full_api_shared.so文件;
- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/include替换PaddleLite-linux-demo/libs/PaddleLite/arm64/include目录;
- 将编译生成的build.lite.armlinux.armv8.gcc/inference_lite_lib.armlinux.armv8.xpu/cxx/lib/libpaddle_full_api_shared.so替换PaddleLite-linux-demo/libs/PaddleLite/arm64/lib/libpaddle_full_api_shared.so文件。
diff --git a/docs/demo_guides/cuda.md b/docs/demo_guides/cuda.md
index f863fd86864194c6d022e4cf1fc75eb46725cc2c..6460d327a4f30753a2d6942d4a931f709641e3ab 100644
--- a/docs/demo_guides/cuda.md
+++ b/docs/demo_guides/cuda.md
@@ -1,5 +1,7 @@
# PaddleLite使用CUDA预测部署
+**注意**: Lite CUDA仅作为Nvidia GPU加速库,支持模型有限,如有需要请使用[PaddleInference](https://paddle-inference.readthedocs.io/en/latest)。
+
Lite支持在x86_64,arm64架构上(如:TX2)进行CUDA的编译运行。
## 编译
diff --git a/docs/images/architecture.png b/docs/images/architecture.png
index 1af783d77dbd52923aa5facc90e00633c908f575..9397ed49a8a0071cf25b4551438d24a86de96bbb 100644
Binary files a/docs/images/architecture.png and b/docs/images/architecture.png differ
diff --git a/docs/images/workflow.png b/docs/images/workflow.png
new file mode 100644
index 0000000000000000000000000000000000000000..98201e78e1a35c830231881d19fb2c0acbdbaeba
Binary files /dev/null and b/docs/images/workflow.png differ
diff --git a/docs/index.rst b/docs/index.rst
index 24dac7f3692649f99bbeabafab53896c2221c29c..88170c3f6ee177b55631b008c888cb88eda866d3 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -57,7 +57,6 @@ Welcome to Paddle-Lite's documentation!
demo_guides/ios_app_demo
demo_guides/linux_arm_demo
demo_guides/x86
- demo_guides/cuda
demo_guides/opencl
demo_guides/fpga
demo_guides/huawei_kirin_npu
diff --git a/docs/introduction/architecture.md b/docs/introduction/architecture.md
index 1a94494af0b44a03988266d341be5788c46f96c2..8af678a5bf2bb1355e21df91752b777c466faee9 100644
--- a/docs/introduction/architecture.md
+++ b/docs/introduction/architecture.md
@@ -5,23 +5,25 @@ Mobile 在这次升级为 Lite 架构, 侧重多硬件、高性能的支持,
- 引入 Type system,强化多硬件、量化方法、data layout 的混合调度能力
- 硬件细节隔离,通过不同编译开关,对支持的任何硬件可以自由插拔
- 引入 MIR(Machine IR) 的概念,强化带执行环境下的优化支持
-- 优化期和执行期严格隔离,保证预测时轻量和高效率
+- 图优化模块和执行引擎实现了良好的解耦拆分,保证预测执行阶段的轻量和高效率
架构图如下
-
+

-## 编译期和执行期严格隔离设计
+## 模型优化阶段和预测执行阶段的隔离设计
-- compile time 优化完毕可以将优化信息存储到模型中;execution time 载入并执行
-- 两套 API 及对应的预测lib,满足不同场景
- - `CxxPredictor` 打包了 `Compile Time` 和 `Execution Time`,可以 runtime 在具体硬件上做分析和优化,得到最优效果
- - `MobilePredictor` 只打包 `Execution Time`,保持部署和执行的轻量
+- Analysis Phase为模型优化阶段,输入为Paddle的推理模型,通过Lite的模型加速和优化策略对计算图进行相关的优化分析,包含算子融合、计算裁剪、存储优化、量化精度转换、Kernel优选等多类图优化手段。优化后的模型更轻量级,在相应的硬件上运行时耗费资源更少,并且执行速度也更快。
+- Execution Phase为预测执行阶段,输入为优化后的Lite模型,仅做模型加载和预测执行两步操作,支持极致的轻量级部署,无任何第三方依赖。
-## `Execution Time` 轻量级设计和实现
+Lite设计了两套 API 及对应的预测库,满足不同场景需求:
+ - `CxxPredictor` 同时包含 `Analysis Phase` 和 `Execution Phase`,支持一站式的预测任务,同时支持模型进行分析优化与预测执行任务,适用于对预测库大小不敏感的硬件场景。
+ - `MobilePredictor` 只包含 `Execution Phase`,保持预测部署和执行的轻量级和高性能,支持从内存或者文件中加载优化后的模型,并进行预测执行。
-- 每个 batch 实际执行只包含两个步骤执行
- - `Op.InferShape`
+## Execution Phase轻量级设计和实现
+
+- 在预测执行阶段,每个 batch 实际执行只包含两个步骤
+ - `OpLite.InferShape` 基于输入推断得到输出的维度
- `Kernel.Run`,Kernel 相关参数均使用指针提前确定,后续无查找或传参消耗
- 设计目标,执行时,只有 kernel 计算本身消耗
- 轻量级 `Op` 及 `Kernel` 设计,避免框架额外消耗
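To make the Analysis/Execution split described in this architecture.md hunk concrete, a minimal sketch of the full-API flow (model paths are placeholders; linking details such as kernel/op registration are omitted):

```cpp
#include "lite/api/paddle_api.h"

using namespace paddle::lite_api;  // NOLINT

int main() {
  // Analysis Phase: CxxPredictor loads the Paddle inference model, runs the
  // graph optimizations and writes out a lightweight NaiveBuffer (.nb) model.
  CxxConfig cxx_config;
  cxx_config.set_model_dir("./mobilenet_v1");  // placeholder path
  cxx_config.set_valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
  auto full_predictor = CreatePaddlePredictor<CxxConfig>(cxx_config);
  full_predictor->SaveOptimizedModel("./mobilenet_v1_opt",
                                     LiteModelType::kNaiveBuffer);

  // Execution Phase: MobilePredictor only loads and runs the optimized model;
  // input/Run/output handling follows the five-step flow in the quick-start
  // tutorial.
  MobileConfig mobile_config;
  mobile_config.set_model_from_file("./mobilenet_v1_opt.nb");
  auto light_predictor = CreatePaddlePredictor<MobileConfig>(mobile_config);
  (void)light_predictor;
  return 0;
}
```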
diff --git a/docs/introduction/support_hardware.md b/docs/introduction/support_hardware.md
index b1a6823d26d4fe8838afee00732707608b836599..3fa1b358aba0b2dd01328fad0e81efc95d75450d 100644
--- a/docs/introduction/support_hardware.md
+++ b/docs/introduction/support_hardware.md
@@ -29,7 +29,8 @@ Paddle Lite支持[ARM Cortex-A系列处理器](https://en.wikipedia.org/wiki/ARM
Paddle Lite支持移动端GPU和Nvidia端上GPU设备,支持列表如下:
- ARM Mali G 系列
- Qualcomm Adreno 系列
-- Nvida tegra系列: tx1, tx2, nano, xavier
+
+ Nvidia Tegra系列: tx1, tx2, nano, xavier
## NPU
Paddle Lite支持NPU,支持列表如下:
diff --git a/docs/introduction/support_model_list.md b/docs/introduction/support_model_list.md
index b30bcd729929de06848285bb27a4d38cec723e67..11f39134b5457703cc00b2dde93d5ab286e48636 100644
--- a/docs/introduction/support_model_list.md
+++ b/docs/introduction/support_model_list.md
@@ -1,32 +1,38 @@
# 支持模型
-目前已严格验证24个模型的精度和性能,对视觉类模型做到了较为充分的支持,覆盖分类、检测和定位,包含了特色的OCR模型的支持,并在不断丰富中。
+目前已严格验证28个模型的精度和性能,对视觉类模型做到了较为充分的支持,覆盖分类、检测和定位,包含了特色的OCR模型的支持,并在不断丰富中。
-| 类别 | 类别细分 | 模型 | 支持Int8 | 支持平台 |
-|-|-|:-:|:-:|-:|
-| CV | 分类 | mobilenetv1 | Y | ARM,X86,NPU,RKNPU,APU |
-| CV | 分类 | mobilenetv2 | Y | ARM,X86,NPU |
-| CV | 分类 | resnet18 | Y | ARM,NPU |
-| CV | 分类 | resnet50 | Y | ARM,X86,NPU,XPU |
-| CV | 分类 | mnasnet | | ARM,NPU |
-| CV | 分类 | efficientnet | | ARM |
-| CV | 分类 | squeezenetv1.1 | | ARM,NPU |
-| CV | 分类 | ShufflenetV2 | Y | ARM |
-| CV | 分类 | shufflenet | Y | ARM |
-| CV | 分类 | inceptionv4 | Y | ARM,X86,NPU |
-| CV | 分类 | vgg16 | Y | ARM |
-| CV | 分类 | googlenet | Y | ARM,X86 |
-| CV | 检测 | mobilenet_ssd | Y | ARM,NPU* |
-| CV | 检测 | mobilenet_yolov3 | Y | ARM,NPU* |
-| CV | 检测 | Faster RCNN | | ARM |
-| CV | 检测 | Mask RCNN | | ARM |
-| CV | 分割 | Deeplabv3 | Y | ARM |
-| CV | 分割 | unet | | ARM |
-| CV | 人脸 | facedetection | | ARM |
-| CV | 人脸 | facebox | | ARM |
-| CV | 人脸 | blazeface | Y | ARM |
-| CV | 人脸 | mtcnn | | ARM |
-| CV | OCR | ocr_attention | | ARM |
-| NLP | 机器翻译 | transformer | | ARM,NPU* |
+| 类别 | 类别细分 | 模型 | 支持平台 |
+|-|-|:-|:-|
+| CV | 分类 | [MobileNetV1](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v1_fp32_224_fluid.tar.gz) | ARM,X86,NPU,RKNPU,APU |
+| CV | 分类 | [MobileNetV2](https://paddlelite-demo.bj.bcebos.com/models/mobilenet_v2_fp32_224_fluid.tar.gz) | ARM,X86,NPU |
+| CV | 分类 | [ResNet18](https://paddlelite-demo.bj.bcebos.com/models/resnet18_fp32_224_fluid.tar.gz) | ARM,NPU |
+| CV | 分类 | [ResNet50](https://paddlelite-demo.bj.bcebos.com/models/resnet50_fp32_224_fluid.tar.gz) | ARM,X86,NPU,XPU |
+| CV | 分类 | [MnasNet](https://paddlelite-demo.bj.bcebos.com/models/mnasnet_fp32_224_fluid.tar.gz) | ARM,NPU |
+| CV | 分类 | [EfficientNet*](https://github.com/PaddlePaddle/PaddleClas) | ARM |
+| CV | 分类 | [SqueezeNet](https://paddlelite-demo.bj.bcebos.com/models/squeezenet_fp32_224_fluid.tar.gz) | ARM,NPU |
+| CV | 分类 | [ShufflenetV2*](https://github.com/PaddlePaddle/PaddleClas) | ARM |
+| CV | 分类 | [ShuffleNet](https://paddlepaddle-inference-banchmark.bj.bcebos.com/shufflenet_inference.tar.gz) | ARM |
+| CV | 分类 | [InceptionV4](https://paddle-inference-dist.bj.bcebos.com/inception_v4_simple.tar.gz) | ARM,X86,NPU |
+| CV | 分类 | [VGG16](https://paddlepaddle-inference-banchmark.bj.bcebos.com/VGG16_inference.tar) | ARM |
+| CV | 分类 | [VGG19](https://paddlepaddle-inference-banchmark.bj.bcebos.com/VGG19_inference.tar) | XPU|
+| CV | 分类 | [GoogleNet](https://paddlepaddle-inference-banchmark.bj.bcebos.com/GoogleNet_inference.tar) | ARM,X86,XPU |
+| CV | 检测 | [MobileNet-SSD](https://paddlelite-demo.bj.bcebos.com/models/ssd_mobilenet_v1_pascalvoc_fp32_300_fluid.tar.gz) | ARM,NPU* |
+| CV | 检测 | [YOLOv3-MobileNetV3](https://paddlelite-demo.bj.bcebos.com/models/yolov3_mobilenet_v3_prune86_FPGM_320_fp32_fluid.tar.gz) | ARM,NPU* |
+| CV | 检测 | [Faster RCNN](https://paddlepaddle-inference-banchmark.bj.bcebos.com/faster_rcnn.tar) | ARM |
+| CV | 检测 | [Mask RCNN*](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/docs/MODEL_ZOO_cn.md) | ARM |
+| CV | 分割 | [Deeplabv3](https://paddlelite-demo.bj.bcebos.com/models/deeplab_mobilenet_fp32_fluid.tar.gz) | ARM |
+| CV | 分割 | [UNet](https://paddlelite-demo.bj.bcebos.com/models/Unet.zip) | ARM |
+| CV | 人脸 | [FaceDetection](https://paddlelite-demo.bj.bcebos.com/models/facedetection_fp32_240_430_fluid.tar.gz) | ARM |
+| CV | 人脸 | [FaceBoxes*](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/docs/featured_model/FACE_DETECTION.md#FaceBoxes) | ARM |
+| CV | 人脸 | [BlazeFace*](https://github.com/PaddlePaddle/PaddleDetection/blob/release/0.4/docs/featured_model/FACE_DETECTION.md#BlazeFace) | ARM |
+| CV | 人脸 | [MTCNN](https://paddlelite-demo.bj.bcebos.com/models/mtcnn.zip) | ARM |
+| CV | OCR | [OCR-Attention](https://paddle-inference-dist.bj.bcebos.com/ocr_attention.tar.gz) | ARM |
+| CV | GAN | [CycleGAN*](https://github.com/PaddlePaddle/models/tree/release/1.7/PaddleCV/gan/cycle_gan) | NPU |
+| NLP | 机器翻译 | [Transformer*](https://github.com/PaddlePaddle/models/tree/release/1.8/PaddleNLP/machine_translation/transformer) | ARM,NPU* |
+| NLP | 机器翻译 | [BERT](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests/bert.tar.gz) | XPU |
+| NLP | 语义表示 | [ERNIE](https://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests/ernie.tar.gz) | XPU |
-> **注意:** NPU* 代表ARM+NPU异构计算
+**注意:**
+1. 模型列表中 * 代表该模型链接来自[PaddlePaddle/models](https://github.com/PaddlePaddle/models),否则为推理模型的下载链接
+2. 支持平台列表中 NPU* 代表ARM+NPU异构计算,否则为NPU计算
diff --git a/docs/quick_start/release_lib.md b/docs/quick_start/release_lib.md
index c2c441bbfa7dea0ae2ebd54f5545ae61590604ec..9c722df1537d49a2c7b8a009b5273b93ff68ffbe 100644
--- a/docs/quick_start/release_lib.md
+++ b/docs/quick_start/release_lib.md
@@ -76,7 +76,6 @@ pip install paddlelite
- [ArmLinux源码编译](../source_compile/compile_linux)
- [x86源码编译](../demo_guides/x86)
- [opencl源码编译](../demo_guides/opencl)
-- [CUDA源码编译](../demo_guides/cuda)
- [FPGA源码编译](../demo_guides/fpga)
- [华为NPU源码编译](../demo_guides/huawei_kirin_npu)
- [百度XPU源码编译](../demo_guides/baidu_xpu)
diff --git a/docs/quick_start/tutorial.md b/docs/quick_start/tutorial.md
index a7eb1327f812917e3f1609d097acaeec2a96997d..e5a63be350fe3111d480ba66e907b7f7613b1425 100644
--- a/docs/quick_start/tutorial.md
+++ b/docs/quick_start/tutorial.md
@@ -2,51 +2,63 @@
Lite是一种轻量级、灵活性强、易于扩展的高性能的深度学习预测框架,它可以支持诸如ARM、OpenCL、NPU等等多种终端,同时拥有强大的图优化及预测加速能力。如果您希望将Lite框架集成到自己的项目中,那么只需要如下几步简单操作即可。
-## 一. 准备模型
-Lite框架目前支持的模型结构为[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)深度学习框架产出的模型格式。因此,在您开始使用 Lite 框架前您需要准备一个由PaddlePaddle框架保存的模型。
-如果您手中的模型是由诸如Caffe2、Tensorflow等框架产出的,那么我们推荐您使用 [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) 工具进行模型格式转换。
+
-## 二. 模型优化
+**一. 准备模型**
-Lite框架拥有强大的加速、优化策略及实现,其中包含诸如量化、子图融合、Kernel优选等等优化手段,为了方便您使用这些优化策略,我们提供了[opt](../user_guides/model_optimize_tool)帮助您轻松进行模型优化。优化后的模型更轻量级,耗费资源更少,并且执行速度也更快。
+Paddle Lite框架直接支持模型结构为[PaddlePaddle](https://github.com/PaddlePaddle/Paddle)深度学习框架产出的模型格式。目前PaddlePaddle用于推理的模型是通过[save_inference_model](https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/io_cn/save_inference_model_cn.html#save-inference-model)这个API保存下来的。
+如果您手中的模型是由诸如Caffe、Tensorflow、PyTorch等框架产出的,那么您可以使用 [X2Paddle](https://github.com/PaddlePaddle/X2Paddle) 工具将模型转换为PaddlePaddle格式。
-opt的详细介绍,请您参考 [模型优化方法](../user_guides/model_optimize_tool)。
+**二. 模型优化**
-下载opt工具后执行以下代码:
+Paddle Lite框架拥有优秀的加速、优化策略及实现,包含量化、子图融合、Kernel优选等优化手段。优化后的模型更轻量级,耗费资源更少,并且执行速度也更快。
+这些优化通过Paddle Lite提供的opt工具实现。opt工具还可以统计并打印出模型中的算子信息,并判断不同硬件平台下Paddle Lite的支持情况。您获取PaddlePaddle格式的模型之后,一般需要通过该opt工具做模型优化。opt工具的下载和使用,请参考 [模型优化方法](https://paddle-lite.readthedocs.io/zh/latest/user_guides/model_optimize_tool.html)。
-``` shell
-$ ./opt \
- --model_dir= \
- --model_file= \
- --param_file= \
- --optimize_out_type=(protobuf|naive_buffer) \
- --optimize_out= \
- --valid_targets=(arm|opencl|x86)
-```
+**注意**: 为了减少第三方库的依赖、提高Lite预测框架的通用性,在移动端使用Lite API您需要准备Naive Buffer存储格式的模型。
-其中,optimize_out为您希望的优化模型的输出路径。optimize_out_type则可以指定输出模型的序列化方式,其目前支持Protobuf与Naive Buffer两种方式,其中Naive Buffer是一种更轻量级的序列化/反序列化实现。如果你需要使用Lite在mobile端进行预测,那么您需要设置optimize_out_type=naive_buffer。
+**三. 下载或编译**
-## 三. 使用Lite框架执行预测
+Paddle Lite提供了Android/iOS/X86平台的官方Release预测库下载,我们优先推荐您直接下载 [Paddle Lite预编译库](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html)。
+您也可以根据目标平台选择对应的[源码编译方法](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#id2)。Paddle Lite 提供了源码编译脚本,位于 `lite/tools/`文件夹下,只需要 [准备环境](https://paddle-lite.readthedocs.io/zh/latest/source_compile/compile_env.html) 和 [调用编译脚本](https://paddle-lite.readthedocs.io/zh/latest/quick_start/release_lib.html#id2) 两个步骤即可一键编译得到目标平台的Paddle Lite预测库。
-在上一节中,我们已经通过`opt`获取到了优化后的模型,使用优化模型进行预测也十分的简单。为了方便您的使用,Lite进行了良好的API设计,隐藏了大量您不需要投入时间研究的细节。您只需要简单的五步即可使用Lite在移动端完成预测(以C++ API进行说明):
+**四. 开发应用程序**
+Paddle Lite提供了C++、Java、Python三种API,只需简单五步即可完成预测(以C++ API为例):
-1. 声明MobileConfig。在config中可以设置**从文件加载模型**也可以设置**从memory加载模型**。从文件加载模型需要声明模型文件路径,如 `config.set_model_from_file(FLAGS_model_file)` ;从memory加载模型方法现只支持加载优化后模型的naive buffer,实现方法为:
-`void set_model_from_buffer(model_buffer) `
+1. 声明`MobileConfig`,设置第二步优化后的模型文件路径,或选择从内存中加载模型
+2. 创建`Predictor`,调用`CreatePaddlePredictor`接口,一行代码即可完成引擎初始化
+3. 准备输入,通过`predictor->GetInput(i)`获取输入变量,并为其指定输入大小和输入值
+4. 执行预测,只需要运行`predictor->Run()`一行代码,即可使用Lite框架完成预测执行
+5. 获得输出,使用`predictor->GetOutput(i)`获取输出变量,并通过`data`取得输出值
-2. 创建Predictor。Predictor即为Lite框架的预测引擎,为了方便您的使用我们提供了 `CreatePaddlePredictor` 接口,你只需要简单的执行一行代码即可完成预测引擎的初始化,`std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config)` 。
-3. 准备输入。执行predictor->GetInput(0)您将会获得输入的第0个field,同样的,如果您的模型有多个输入,那您可以执行 `predictor->GetInput(i)` 来获取相应的输入变量。得到输入变量后您可以使用Resize方法指定其具体大小,并填入输入值。
-4. 执行预测。您只需要执行 `predictor->Run()` 即可使用Lite框架完成预测。
-5. 获取输出。与输入类似,您可以使用 `predictor->GetOutput(i)` 来获得输出的第i个变量。您可以通过其shape()方法获取输出变量的维度,通过 `data()` 模板方法获取其输出值。
+Paddle Lite提供了C++、Java、Python三种API的完整使用示例和开发说明文档,您可以参考示例中的说明快速了解使用方法,并集成到您自己的项目中去。
+- [C++完整示例](cpp_demo.html)
+- [Java完整示例](java_demo.html)
+- [Python完整示例](python_demo.html)
+针对不同的硬件平台,Paddle Lite提供了各个平台的完整示例:
+- [Android示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/android_app_demo.html)
+- [iOS示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/ios_app_demo.html)
+- [ARMLinux示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/linux_arm_demo.html)
+- [X86示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html)
+- [OpenCL示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/opencl.html)
+- [FPGA示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/fpga.html)
+- [华为NPU示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/huawei_kirin_npu.html)
+- [百度XPU示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/baidu_xpu.html)
+- [瑞芯微NPU示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/rockchip_npu.html)
+- [联发科APU示例](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/mediatek_apu.html)
-## 四. Lite API
-您也可以下载以下基于Paddle-Lite开发的预测APK程序,安装到Android平台上,先睹为快:
-为了方便您的使用,我们提供了C++、Java、Python三种API,并且提供了相应的api的完整使用示例:[C++完整示例](cpp_demo)、[Java完整示例](java_demo)、[Python完整示例](python_demo),您可以参考示例中的说明快速了解C++/Java/Python的API使用方法,并集成到您自己的项目中去。需要说明的是,为了减少第三方库的依赖、提高Lite预测框架的通用性,在移动端使用Lite API您需要准备Naive Buffer存储格式的模型,具体方法可参考第2节`模型优化`。
+- [图像分类](https://paddlelite-demo.bj.bcebos.com/apps/android/mobilenet_classification_demo.apk)
+- [目标检测](https://paddlelite-demo.bj.bcebos.com/apps/android/yolo_detection_demo.apk)
+- [口罩检测](https://paddlelite-demo.bj.bcebos.com/apps/android/mask_detection_demo.apk)
+- [人脸关键点](https://paddlelite-demo.bj.bcebos.com/apps/android/face_keypoints_detection_demo.apk)
+- [人像分割](https://paddlelite-demo.bj.bcebos.com/apps/android/human_segmentation_demo.apk)
-## 五. 测试工具
+## 更多测试工具
为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](../user_guides/debug) 和 [Profile工具](../user_guides/debug)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](../user_guides/debug) 了解更多内容。
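A minimal sketch of the five-step C++ flow described in this tutorial (model path and input shape are placeholders for a classification model):

```cpp
#include "lite/api/paddle_api.h"

using namespace paddle::lite_api;  // NOLINT

int main() {
  // 1. MobileConfig: point it at the .nb model produced by opt
  //    (set_model_from_buffer is the in-memory alternative).
  MobileConfig config;
  config.set_model_from_file("./mobilenet_v1_opt.nb");  // placeholder path

  // 2. Create the predictor: one call initializes the execution engine.
  auto predictor = CreatePaddlePredictor<MobileConfig>(config);

  // 3. Prepare the input: resize tensor 0 and fill it with data.
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* in_data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) in_data[i] = 1.f;

  // 4. Run inference.
  predictor->Run();

  // 5. Fetch output tensor 0 and read its values via data<float>().
  auto output = predictor->GetOutput(0);
  const float* out_data = output->data<float>();
  return out_data == nullptr ? 1 : 0;
}
```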
diff --git a/docs/source_compile/compile_env.md b/docs/source_compile/compile_env.md
index 5322558afbf2c3ad09f04e0596ddc18f967ffabb..7c32311cda212091796a2cff7d60bbefbb751e7c 100644
--- a/docs/source_compile/compile_env.md
+++ b/docs/source_compile/compile_env.md
@@ -19,7 +19,6 @@ Paddle Lite提供了Android/iOS/X86平台的官方Release预测库下载,如
- [ArmLinux源码编译](../source_compile/compile_linux)
- [X86源码编译](../demo_guides/x86)
- [OpenCL源码编译](../demo_guides/opencl)
-- [CUDA源码编译](../demo_guides/cuda)
- [FPGA源码编译](../demo_guides/fpga)
- [华为NPU源码编译](../demo_guides/huawei_kirin_npu)
- [百度XPU源码编译](../demo_guides/baidu_xpu)
diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt
index 228b09bcff8a30869d7828a2a5a71fa0cb802292..d69f6d6d9e77668c5789baff3f2f1051afe5df46 100755
--- a/lite/CMakeLists.txt
+++ b/lite/CMakeLists.txt
@@ -40,7 +40,8 @@ endif()
if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz")
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
- lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
+ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
+ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1_int16.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
@@ -51,11 +52,19 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
- lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz")
- lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz")
- lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz")
+
+ set(LITE_URL_FOR_UNITTESTS "http://paddle-inference-dist.bj.bcebos.com/PaddleLite/models_and_data_for_unittests")
+ # models
+ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "resnet50.tar.gz")
+ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert.tar.gz")
+ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ernie.tar.gz")
+ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "GoogLeNet.tar.gz")
+ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "VGG19.tar.gz")
+ # data
+ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "ILSVRC2012_small.tar.gz")
+ lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL_FOR_UNITTESTS} "bert_data.tar.gz")
endif()
endif()
diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt
index 5be30b1ea5ec649e81d4e28dca2f412816cef361..3e8fd5fd637c02842e068801278fab94ac7d5d4f 100644
--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -15,7 +15,6 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
- target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files})
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry fbs_headers)
target_link_libraries(paddle_full_api_shared framework_proto op_registry)
if(LITE_WITH_X86)
@@ -70,6 +69,10 @@ else()
set(TARGET_COMIPILE_FLAGS "-fdata-sections")
if (NOT (ARM_TARGET_LANG STREQUAL "clang")) #gcc
set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto")
+ # TODO (hong19860320): Disable LTO temporarily since it prevents exceptions from being caught on Android when the toolchain is gcc.
+ if (ARM_TARGET_OS STREQUAL "android" AND LITE_WITH_EXCEPTION)
+ set(TARGET_COMIPILE_FLAGS "")
+ endif()
endif()
set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}")
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h fbs_headers)
@@ -288,6 +291,14 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set_target_properties(test_mobilenetv1 PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
endif()
+
+ lite_cc_test(test_mobilenetv1_int16 SRCS mobilenetv1_int16_test.cc
+ DEPS ${lite_model_test_DEPS} ${light_lib_DEPS}
+ CL_DEPS ${opencl_kernels}
+ NPU_DEPS ${npu_kernels} ${npu_bridges}
+ ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
+ --model_dir=${LITE_MODEL_DIR}/mobilenet_v1_int16 SERIAL)
+ add_dependencies(test_mobilenetv1_int16 extern_lite_download_mobilenet_v1_int16_tar_gz)
lite_cc_test(test_mobilenetv2 SRCS mobilenetv2_test.cc
DEPS ${lite_model_test_DEPS}
diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt
index 4638ed5fdfb360c1475ad6e2d1a8eb2051673eb1..1aa9aeeeff6f2737aa3a2a31beaedb0dbf4184f8 100644
--- a/lite/api/android/jni/native/CMakeLists.txt
+++ b/lite/api/android/jni/native/CMakeLists.txt
@@ -17,7 +17,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
# Unlike static library, module library has to link target to be able to work
# as a single .so lib.
target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels})
- add_dependencies(paddle_lite_jni fbs_headers)
if (LITE_WITH_NPU)
# Strips the symbols of our protobuf functions to fix the conflicts during
# loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so)
diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc
index 1dccbb49a4b15a397ae37b1373b5df3cf95e7e9f..b72a6e6bdb2dd170460d0cbb2f3257e337625671 100644
--- a/lite/api/benchmark.cc
+++ b/lite/api/benchmark.cc
@@ -30,8 +30,6 @@
#include
#include
#include "lite/api/paddle_api.h"
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
#include "lite/core/device_info.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
index 3b3337139b3c5e3d475503ac682194a0ed348e4f..0b5b9ad94c47a3d97492cd5b91618b184c9ef122 100644
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -58,6 +58,16 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config.mlu_input_layout(),
config.mlu_firstconv_param());
#endif // LITE_WITH_MLU
+
+#ifdef LITE_WITH_BM
+ Env<TARGET(kBM)>::Init();
+ int device_id = 0;
+ if (const char *c_id = getenv("BM_VISIBLE_DEVICES")) {
+ device_id = static_cast<int>(*c_id) - 48;
+ }
+ TargetWrapper<TARGET(kBM)>::SetDevice(device_id);
+#endif // LITE_WITH_BM
+
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass;
@@ -86,7 +96,7 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
- !(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
+ !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
int num_threads = config.x86_math_library_num_threads();
int real_num_threads = num_threads > 1 ? num_threads : 1;
paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
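For reference, the `BM_VISIBLE_DEVICES` handling added above takes only the first character of the variable and subtracts the ASCII code of `'0'` (48). A standalone sketch of the same mapping, with `std::atoi` noted as a multi-digit alternative (illustration only, not what the patch does):

```cpp
#include <cstdlib>

int SelectedBmDeviceId() {
  int device_id = 0;
  if (const char* c_id = std::getenv("BM_VISIBLE_DEVICES")) {
    device_id = *c_id - '0';        // patch behaviour: first digit only
    // device_id = std::atoi(c_id); // alternative: also handles ids >= 10
  }
  return device_id;
}
```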
diff --git a/lite/api/cxx_api_test.cc b/lite/api/cxx_api_test.cc
index 768480b1475c3609137f255cbac9ae9d4785a96b..8a28722799c4a2bb7f3512402b2f364fa84831ad 100644
--- a/lite/api/cxx_api_test.cc
+++ b/lite/api/cxx_api_test.cc
@@ -131,7 +131,8 @@ TEST(CXXApi, save_model) {
predictor.Build(FLAGS_model_dir, "", "", valid_places);
LOG(INFO) << "Save optimized model to " << FLAGS_optimized_model;
- predictor.SaveModel(FLAGS_optimized_model);
+ predictor.SaveModel(FLAGS_optimized_model,
+ lite_api::LiteModelType::kProtobuf);
predictor.SaveModel(FLAGS_optimized_model + ".naive",
lite_api::LiteModelType::kNaiveBuffer);
}
diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc
index fbcf171726d741ef0073f423bc4a600c9f9389d0..56461fded536f87ee59ecc8efbe2d3463c7c3822 100644
--- a/lite/api/light_api.cc
+++ b/lite/api/light_api.cc
@@ -46,7 +46,6 @@ void LightPredictor::Build(const std::string& model_dir,
case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break;
-#endif
case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) {
LoadModelNaiveFromMemory(
@@ -56,6 +55,7 @@ void LightPredictor::Build(const std::string& model_dir,
}
break;
}
+#endif
default:
LOG(FATAL) << "Unknown model type";
}
diff --git a/lite/api/light_api_impl.cc b/lite/api/light_api_impl.cc
index c9c34377e2a82b72d26e3148a694fe0662e985ce..3c5be7b9cdd340fe0fe82c589706c77875de0030 100644
--- a/lite/api/light_api_impl.cc
+++ b/lite/api/light_api_impl.cc
@@ -17,6 +17,10 @@
#include "lite/api/paddle_api.h"
#include "lite/core/version.h"
#include "lite/model_parser/model_parser.h"
+#ifndef LITE_ON_TINY_PUBLISH
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#endif
namespace paddle {
namespace lite {
diff --git a/lite/api/mobilenetv1_int16_test.cc b/lite/api/mobilenetv1_int16_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..266052044ef6543a0f00ad50bc9b89b70656bbe6
--- /dev/null
+++ b/lite/api/mobilenetv1_int16_test.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/cxx_api.h"
+#include "lite/api/light_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/core/op_registry.h"
+
+DEFINE_string(optimized_model,
+ "/data/local/tmp/int16_model",
+ "optimized_model");
+DEFINE_int32(N, 1, "input_batch");
+DEFINE_int32(C, 3, "input_channel");
+DEFINE_int32(H, 224, "input_height");
+DEFINE_int32(W, 224, "input_width");
+
+namespace paddle {
+namespace lite {
+
+void TestModel(const std::vector<Place>& valid_places,
+ const std::string& model_dir) {
+ DeviceInfo::Init();
+ DeviceInfo::Global().SetRunMode(lite_api::LITE_POWER_NO_BIND, FLAGS_threads);
+
+ LOG(INFO) << "Optimize model.";
+ lite::Predictor cxx_predictor;
+ cxx_predictor.Build(model_dir, "", "", valid_places);
+ cxx_predictor.SaveModel(FLAGS_optimized_model,
+ paddle::lite_api::LiteModelType::kNaiveBuffer);
+
+ LOG(INFO) << "Load optimized model.";
+ lite::LightPredictor predictor(FLAGS_optimized_model + ".nb", false);
+
+ auto* input_tensor = predictor.GetInput(0);
+ input_tensor->Resize(DDim(
+ std::vector<int64_t>({FLAGS_N, FLAGS_C, FLAGS_H, FLAGS_W})));
+ auto* data = input_tensor->mutable_data<float>();
+ auto item_size = FLAGS_N * FLAGS_C * FLAGS_H * FLAGS_W;
+ for (int i = 0; i < item_size; i++) {
+ data[i] = 1.;
+ }
+
+ LOG(INFO) << "Predictor run.";
+ predictor.Run();
+
+ auto* out = predictor.GetOutput(0);
+ const auto* pdata = out->data<float>();
+
+ std::vector<float> ref = {
+ 0.000191383, 0.000592063, 0.000112282, 6.27426e-05, 0.000127522};
+ double eps = 1e-5;
+ for (int i = 0; i < ref.size(); ++i) {
+ EXPECT_NEAR(pdata[i], ref[i], eps);
+ }
+}
+
+TEST(MobileNetV1_Int16, test_arm) {
+ std::vector<Place> valid_places({
+ Place{TARGET(kARM), PRECISION(kFloat)},
+ });
+ std::string model_dir = FLAGS_model_dir;
+ TestModel(valid_places, model_dir);
+}
+
+} // namespace lite
+} // namespace paddle
diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc
index 90575280873c8cda9310cfc951645f4614c2ce30..3cce247750341b37bf9aff07fce8ec54ee1428fe 100644
--- a/lite/api/model_test.cc
+++ b/lite/api/model_test.cc
@@ -25,8 +25,6 @@
#include "lite/core/profile/basic_profiler.h"
#endif // LITE_WITH_PROFILE
#include
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
using paddle::lite::profile::Timer;
diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc
index a3d29dff93155b4a1eaefd91d35080831601eedf..d37657206d093f666ab486dff5aa1c151efce0eb 100644
--- a/lite/api/paddle_api.cc
+++ b/lite/api/paddle_api.cc
@@ -356,5 +356,13 @@ void MobileConfig::set_model_buffer(const char *model_buffer,
model_from_memory_ = true;
}
+// This is the method for allocating workspace_size according to L3Cache size
+void MobileConfig::SetArmL3CacheSize(L3CacheSetMethod method,
+ int absolute_val) {
+#ifdef LITE_WITH_ARM
+ lite::DeviceInfo::Global().SetArmL3CacheSize(method, absolute_val);
+#endif
+}
+
} // namespace lite_api
} // namespace paddle
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index 42a4b2228b5dc007bc0d6053f15e843bd6343c8f..7df7f7889af5b059a60aa191540a02e9f2ec755f 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
using shape_t = std::vector<int64_t>;
using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
+// Methods for allocating L3Cache on Arm platform
+enum class L3CacheSetMethod {
+ kDeviceL3Cache = 0, // Use the system L3 Cache size, best performance.
+ kDeviceL2Cache = 1, // Use the system L2 Cache size, trade off performance
+ // with less memory consumption.
+ kAbsolute = 2, // Use the external setting.
+ // kAutoGrow = 3, // Not supported yet, least memory consumption.
+};
// return true if current device supports OpenCL model
LITE_API bool IsOpenCLBackendValid();
@@ -294,6 +302,11 @@ class LITE_API MobileConfig : public ConfigBase {
// NOTE: This is a deprecated API and will be removed in latter release.
const std::string& param_buffer() const { return param_buffer_; }
+
+ // This is the method for allocating workspace_size according to L3Cache size
+ void SetArmL3CacheSize(
+ L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
+ int absolute_val = -1);
};
template <typename ConfigT>
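The three `L3CacheSetMethod` values are consumed by `MobileConfig::SetArmL3CacheSize` declared above; a short usage sketch (the calls mirror the ones added to `paddle_api_test.cc` below):

```cpp
#include "lite/api/paddle_api.h"

void ConfigureWorkspace(paddle::lite_api::MobileConfig* config) {
  using paddle::lite_api::L3CacheSetMethod;
  // Default: size the workspace from the device's L3 cache (best performance).
  config->SetArmL3CacheSize(L3CacheSetMethod::kDeviceL3Cache);
  // Trade a little performance for memory: use the L2 cache size instead.
  config->SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
  // Fully manual: reserve an absolute 1 MiB workspace.
  config->SetArmL3CacheSize(L3CacheSetMethod::kAbsolute, 1024 * 1024);
}
```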
diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc
index c381546dfba9326d48b27e094a39dd4cd082c462..41799bdc2c6582e6d987d7d896db1f499eb4cdf4 100644
--- a/lite/api/paddle_api_test.cc
+++ b/lite/api/paddle_api_test.cc
@@ -15,8 +15,6 @@
#include "lite/api/paddle_api.h"
#include
#include
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/io.h"
@@ -109,7 +107,8 @@ TEST(CxxApi, share_external_data) {
TEST(LightApi, run) {
lite_api::MobileConfig config;
config.set_model_from_file(FLAGS_model_dir + ".opt2.naive.nb");
-
+ // disable L3 cache on workspace_ allocating
+ config.SetArmL3CacheSize(L3CacheSetMethod::kDeviceL2Cache);
auto predictor = lite_api::CreatePaddlePredictor(config);
auto inputs = predictor->GetInputNames();
@@ -150,6 +149,8 @@ TEST(MobileConfig, LoadfromMemory) {
// set model buffer and run model
lite_api::MobileConfig config;
config.set_model_from_buffer(model_buffer);
+ // allocate 1M initial space for workspace_
+ config.SetArmL3CacheSize(L3CacheSetMethod::kAbsolute, 1024 * 1024);
auto predictor = lite_api::CreatePaddlePredictor(config);
auto input_tensor = predictor->GetInput(0);
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index cea2a45c5db15891a4de679265a9c2cd2779d0fb..a4ea030cbf3ae7ead5836f02638ff440335f89fe 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -62,6 +62,7 @@ USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
+USE_MIR_PASS(__xpu__resnet_d_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt
index 1f8ee66a0dbce37480672cc213a60d87d28c4142..b0b897b5d47089eb4331bf4909b4e778092a6a7b 100644
--- a/lite/api/python/pybind/CMakeLists.txt
+++ b/lite/api/python/pybind/CMakeLists.txt
@@ -9,7 +9,7 @@ if(WIN32)
target_link_libraries(lite_pybind ${os_dependency_modules})
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
- target_sources(lite_pybind PUBLIC ${__lite_cc_files})
+ target_sources(lite_pybind PUBLIC ${__lite_cc_files} fbs_headers)
endif(WIN32)
if (LITE_ON_TINY_PUBLISH)
diff --git a/lite/backends/apu/neuron_adapter.cc b/lite/backends/apu/neuron_adapter.cc
index 953c92d1828848bd030a65cb2a8af0eac0674ca1..ff08507504b8bd7e5342c5705afb17550f37469e 100644
--- a/lite/backends/apu/neuron_adapter.cc
+++ b/lite/backends/apu/neuron_adapter.cc
@@ -82,16 +82,20 @@ void NeuronAdapter::InitFunctions() {
PADDLE_DLSYM(NeuronModel_setOperandValue);
PADDLE_DLSYM(NeuronModel_setOperandSymmPerChannelQuantParams);
PADDLE_DLSYM(NeuronModel_addOperation);
+ PADDLE_DLSYM(NeuronModel_addOperationExtension);
PADDLE_DLSYM(NeuronModel_identifyInputsAndOutputs);
PADDLE_DLSYM(NeuronCompilation_create);
PADDLE_DLSYM(NeuronCompilation_free);
PADDLE_DLSYM(NeuronCompilation_finish);
+ PADDLE_DLSYM(NeuronCompilation_createForDevices);
PADDLE_DLSYM(NeuronExecution_create);
PADDLE_DLSYM(NeuronExecution_free);
PADDLE_DLSYM(NeuronExecution_setInput);
PADDLE_DLSYM(NeuronExecution_setOutput);
PADDLE_DLSYM(NeuronExecution_compute);
-
+ PADDLE_DLSYM(Neuron_getDeviceCount);
+ PADDLE_DLSYM(Neuron_getDevice);
+ PADDLE_DLSYM(NeuronDevice_getName);
#undef PADDLE_DLSYM
}
@@ -146,6 +150,25 @@ int NeuronModel_addOperation(NeuronModel* model,
model, type, inputCount, inputs, outputCount, outputs);
}
+int NeuronModel_addOperationExtension(NeuronModel* model,
+ const char* name,
+ const char* vendor,
+ const NeuronDevice* device,
+ uint32_t inputCount,
+ const uint32_t* inputs,
+ uint32_t outputCount,
+ const uint32_t* outputs) {
+ return paddle::lite::NeuronAdapter::Global()
+ ->NeuronModel_addOperationExtension()(model,
+ name,
+ vendor,
+ device,
+ inputCount,
+ inputs,
+ outputCount,
+ outputs);
+}
+
int NeuronModel_identifyInputsAndOutputs(NeuronModel* model,
uint32_t inputCount,
const uint32_t* inputs,
@@ -172,6 +195,15 @@ int NeuronCompilation_finish(NeuronCompilation* compilation) {
compilation);
}
+int NeuronCompilation_createForDevices(NeuronModel* model,
+ const NeuronDevice* const* devices,
+ uint32_t numDevices,
+ NeuronCompilation** compilation) {
+ return paddle::lite::NeuronAdapter::Global()
+ ->NeuronCompilation_createForDevices()(
+ model, devices, numDevices, compilation);
+}
+
int NeuronExecution_create(NeuronCompilation* compilation,
NeuronExecution** execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_create()(
@@ -205,3 +237,18 @@ int NeuronExecution_compute(NeuronExecution* execution) {
return paddle::lite::NeuronAdapter::Global()->NeuronExecution_compute()(
execution);
}
+
+int Neuron_getDeviceCount(uint32_t* numDevices) {
+ return paddle::lite::NeuronAdapter::Global()->Neuron_getDeviceCount()(
+ numDevices);
+}
+
+int Neuron_getDevice(uint32_t devIndex, NeuronDevice** device) {
+ return paddle::lite::NeuronAdapter::Global()->Neuron_getDevice()(devIndex,
+ device);
+}
+
+int NeuronDevice_getName(const NeuronDevice* device, const char** name) {
+ return paddle::lite::NeuronAdapter::Global()->NeuronDevice_getName()(device,
+ name);
+}
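A hedged sketch of how the newly exposed device-enumeration wrappers might be chained to compile on a specific Neuron device (the device-name substring is an assumption for illustration, and a zero return value is treated as success):

```cpp
#include <cstring>
#include "lite/backends/apu/neuron_adapter.h"

// Find a device whose reported name contains `keyword` (e.g. "mtk-dsp",
// an assumed name) and create a compilation bound to that device.
bool CompileOnNamedDevice(NeuronModel* model,
                          const char* keyword,
                          NeuronCompilation** compilation) {
  uint32_t count = 0;
  if (Neuron_getDeviceCount(&count) != 0 || count == 0) return false;
  for (uint32_t i = 0; i < count; ++i) {
    NeuronDevice* device = nullptr;
    const char* name = nullptr;
    if (Neuron_getDevice(i, &device) != 0) continue;
    if (NeuronDevice_getName(device, &name) != 0 || name == nullptr) continue;
    if (std::strstr(name, keyword) != nullptr) {
      const NeuronDevice* devices[1] = {device};
      return NeuronCompilation_createForDevices(model, devices, 1,
                                                compilation) == 0;
    }
  }
  return false;
}
```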
diff --git a/lite/backends/apu/neuron_adapter.h b/lite/backends/apu/neuron_adapter.h
index c08db73279ea3969300c8f298016a976e30a7ac4..c1b9669a98626699b126913dcc840906de4de8e0 100644
--- a/lite/backends/apu/neuron_adapter.h
+++ b/lite/backends/apu/neuron_adapter.h
@@ -42,12 +42,25 @@ class NeuronAdapter final {
const uint32_t *,
uint32_t,
const uint32_t *);
+ using NeuronModel_addOperationExtension_Type = int (*)(NeuronModel *,
+ const char *,
+ const char *,
+ const NeuronDevice *,
+ uint32_t,
+ const uint32_t *,
+ uint32_t,
+ const uint32_t *);
using NeuronModel_identifyInputsAndOutputs_Type = int (*)(
NeuronModel *, uint32_t, const uint32_t *, uint32_t, const uint32_t *);
using NeuronCompilation_create_Type = int (*)(NeuronModel *,
NeuronCompilation **);
using NeuronCompilation_free_Type = void (*)(NeuronCompilation *);
using NeuronCompilation_finish_Type = int (*)(NeuronCompilation *);
+ using NeuronCompilation_createForDevices_Type =
+ int (*)(NeuronModel *,
+ const NeuronDevice *const *,
+ uint32_t,
+ NeuronCompilation **);
using NeuronExecution_create_Type = int (*)(NeuronCompilation *,
NeuronExecution **);
using NeuronExecution_free_Type = void (*)(NeuronExecution *);
@@ -59,6 +72,10 @@ class NeuronAdapter final {
using NeuronExecution_setOutput_Type = int (*)(
NeuronExecution *, int32_t, const NeuronOperandType *, void *, size_t);
using NeuronExecution_compute_Type = int (*)(NeuronExecution *);
+ using Neuron_getDeviceCount_Type = int (*)(uint32_t *);
+ using Neuron_getDevice_Type = int (*)(uint32_t, NeuronDevice **);
+ using NeuronDevice_getName_Type = int (*)(const NeuronDevice *,
+ const char **);
Neuron_getVersion_Type Neuron_getVersion() {
CHECK(Neuron_getVersion_ != nullptr) << "Cannot load Neuron_getVersion!";
@@ -105,6 +122,12 @@ class NeuronAdapter final {
return NeuronModel_addOperation_;
}
+ NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension() {
+ CHECK(NeuronModel_addOperationExtension_ != nullptr)
+ << "Cannot load NeuronModel_addOperationExtension!";
+ return NeuronModel_addOperationExtension_;
+ }
+
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs() {
CHECK(NeuronModel_identifyInputsAndOutputs_ != nullptr)
@@ -130,6 +153,12 @@ class NeuronAdapter final {
return NeuronCompilation_finish_;
}
+ NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices() {
+ CHECK(NeuronCompilation_createForDevices_ != nullptr)
+ << "Cannot load NeuronCompilation_createForDevices!";
+ return NeuronCompilation_createForDevices_;
+ }
+
NeuronExecution_create_Type NeuronExecution_create() {
CHECK(NeuronExecution_create_ != nullptr)
<< "Cannot load NeuronExecution_create!";
@@ -160,6 +189,23 @@ class NeuronAdapter final {
return NeuronExecution_compute_;
}
+ Neuron_getDeviceCount_Type Neuron_getDeviceCount() {
+ CHECK(Neuron_getDeviceCount_ != nullptr)
+ << "Cannot load Neuron_getDeviceCount!";
+ return Neuron_getDeviceCount_;
+ }
+
+ Neuron_getDevice_Type Neuron_getDevice() {
+ CHECK(Neuron_getDevice_ != nullptr) << "Cannot load Neuron_getDevice!";
+ return Neuron_getDevice_;
+ }
+
+ NeuronDevice_getName_Type NeuronDevice_getName() {
+ CHECK(NeuronDevice_getName_ != nullptr)
+ << "Cannot load NeuronDevice_getName!";
+ return NeuronDevice_getName_;
+ }
+
private:
NeuronAdapter();
NeuronAdapter(const NeuronAdapter &) = delete;
@@ -176,16 +222,23 @@ class NeuronAdapter final {
NeuronModel_setOperandSymmPerChannelQuantParams_Type
NeuronModel_setOperandSymmPerChannelQuantParams_{nullptr};
NeuronModel_addOperation_Type NeuronModel_addOperation_{nullptr};
+ NeuronModel_addOperationExtension_Type NeuronModel_addOperationExtension_{
+ nullptr};
NeuronModel_identifyInputsAndOutputs_Type
NeuronModel_identifyInputsAndOutputs_{nullptr};
NeuronCompilation_create_Type NeuronCompilation_create_{nullptr};
NeuronCompilation_free_Type NeuronCompilation_free_{nullptr};
NeuronCompilation_finish_Type NeuronCompilation_finish_{nullptr};
+ NeuronCompilation_createForDevices_Type NeuronCompilation_createForDevices_{
+ nullptr};
NeuronExecution_create_Type NeuronExecution_create_{nullptr};
NeuronExecution_free_Type NeuronExecution_free_{nullptr};
NeuronExecution_setInput_Type NeuronExecution_setInput_{nullptr};
NeuronExecution_setOutput_Type NeuronExecution_setOutput_{nullptr};
NeuronExecution_compute_Type NeuronExecution_compute_{nullptr};
+ Neuron_getDeviceCount_Type Neuron_getDeviceCount_{nullptr};
+ Neuron_getDevice_Type Neuron_getDevice_{nullptr};
+ NeuronDevice_getName_Type NeuronDevice_getName_{nullptr};
};
} // namespace lite
} // namespace paddle
diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt
index 244467d62492bc3017ebdb6144b49ccb9fcd30c1..88c449e6a9d8b8078802e90dded5db1162459d3f 100644
--- a/lite/backends/arm/math/CMakeLists.txt
+++ b/lite/backends/arm/math/CMakeLists.txt
@@ -127,8 +127,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
anchor_generator.cc
split_merge_lod_tenosr.cc
reduce_prod.cc
+ reduce_sum.cc
lstm.cc
clip.cc
pixel_shuffle.cc
+ scatter.cc
DEPS ${lite_kernel_deps} context tensor)
endif()
diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc
index c998ddc3a34c2f6194a5156b7d04b7a9db3fbcef..b4539db98c3ffb1a143c38dd3c4dd9e9924bd63e 100644
--- a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc
+++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc
@@ -25,6 +25,73 @@ namespace paddle {
namespace lite {
namespace arm {
namespace math {
+void conv_3x3s1_depthwise_fp32_bias(const float* i_data,
+ float* o_data,
+ int bs,
+ int oc,
+ int oh,
+ int ow,
+ int ic,
+ int ih,
+ int win,
+ const float* weights,
+ const float* bias,
+ float* relu_ptr,
+ float* six_ptr,
+ float* scale_ptr,
+ const operators::ConvParam& param,
+ ARMContext* ctx);
+
+void conv_3x3s1_depthwise_fp32_relu(const float* i_data,
+ float* o_data,
+ int bs,
+ int oc,
+ int oh,
+ int ow,
+ int ic,
+ int ih,
+ int win,
+ const float* weights,
+ const float* bias,
+ float* relu_ptr,
+ float* six_ptr,
+ float* scale_ptr,
+ const operators::ConvParam& param,
+ ARMContext* ctx);
+
+void conv_3x3s1_depthwise_fp32_relu6(const float* i_data,
+ float* o_data,
+ int bs,
+ int oc,
+ int oh,
+ int ow,
+ int ic,
+ int ih,
+ int win,
+ const float* weights,
+ const float* bias,
+ float* relu_ptr,
+ float* six_ptr,
+ float* scale_ptr,
+ const operators::ConvParam& param,
+ ARMContext* ctx);
+
+void conv_3x3s1_depthwise_fp32_leakyRelu(const float* i_data,
+ float* o_data,
+ int bs,
+ int oc,
+ int oh,
+ int ow,
+ int ic,
+ int ih,
+ int win,
+ const float* weights,
+ const float* bias,
+ float* relu_ptr,
+ float* six_ptr,
+ float* scale_ptr,
+ const operators::ConvParam& param,
+ ARMContext* ctx);
// clang-format off
#ifdef __aarch64__
#define COMPUTE \
@@ -335,7 +402,6 @@ namespace math {
"ldr r0, [%[outl]] @ load outc00 to r0\n" \
"vmla.f32 q12, q5, q0 @ w8 * inr32\n" \
"vmla.f32 q13, q5, q1 @ w8 * inr33\n" \
- "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" \
"vmla.f32 q14, q5, q2 @ w8 * inr34\n" \
"vmla.f32 q15, q5, q3 @ w8 * inr35\n" \
"ldr r1, [%[outl], #4] @ load outc10 to r1\n" \
@@ -406,7 +472,6 @@ namespace math {
"vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" \
"vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" \
"vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" \
- "ldr r5, [%[outl], #20] @ load outc11 to r5\n" \
"vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" \
"vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" \
"vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" \
@@ -417,12 +482,13 @@ namespace math {
"vst1.32 {d18-d19}, [r1] @ save outc10\n" \
"vst1.32 {d20-d21}, [r2] @ save outc20\n" \
"vst1.32 {d22-d23}, [r3] @ save outc30\n" \
+ "ldr r0, [%[outl], #20] @ load outc11 to r5\n" \
+ "ldr r1, [%[outl], #24] @ load outc21 to r0\n" \
+ "ldr r2, [%[outl], #28] @ load outc31 to r1\n" \
"vst1.32 {d24-d25}, [r4] @ save outc01\n" \
- "vst1.32 {d26-d27}, [r5] @ save outc11\n" \
- "ldr r0, [%[outl], #24] @ load outc21 to r0\n" \
- "ldr r1, [%[outl], #28] @ load outc31 to r1\n" \
- "vst1.32 {d28-d29}, [r0] @ save outc21\n" \
- "vst1.32 {d30-d31}, [r1] @ save outc31\n" \
+ "vst1.32 {d26-d27}, [r0] @ save outc11\n" \
+ "vst1.32 {d28-d29}, [r1] @ save outc21\n" \
+ "vst1.32 {d30-d31}, [r2] @ save outc31\n" \
"b 3f @ branch end\n" \
"2: \n" \
"vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" \
@@ -436,291 +502,86 @@ namespace math {
"3: \n"
#endif
// clang-format on
-void act_switch_3x3s1(const float* inr0,
- const float* inr1,
- const float* inr2,
- const float* inr3,
- float* out0,
- const float* weight_c,
- float flag_mask,
- void* outl_ptr,
- float32x4_t w0,
- float32x4_t w1,
- float32x4_t w2,
- float32x4_t w3,
- float32x4_t w4,
- float32x4_t w5,
- float32x4_t w6,
- float32x4_t w7,
- float32x4_t w8,
- float32x4_t vbias,
- const operators::ActivationParam act_param) {
- bool has_active = act_param.has_active;
- if (has_active) {
+void conv_3x3s1_depthwise_fp32(const float* i_data,
+ float* o_data,
+ int bs,
+ int oc,
+ int oh,
+ int ow,
+ int ic,
+ int ih,
+ int win,
+ const float* weights,
+ const float* bias,
+ const operators::ConvParam& param,
+ const operators::ActivationParam act_param,
+ ARMContext* ctx) {
+ float six_ptr[4] = {0.f, 0.f, 0.f, 0.f};
+ float scale_ptr[4] = {1.f, 1.f, 1.f, 1.f};
+ float relu_ptr[4] = {0.f, 0.f, 0.f, 0.f};
+ if (act_param.has_active) {
switch (act_param.active_type) {
case lite_api::ActivationType::kRelu:
-#ifdef __aarch64__
- asm volatile(COMPUTE RELU STORE
- : [inr0] "+r"(inr0),
- [inr1] "+r"(inr1),
- [inr2] "+r"(inr2),
- [inr3] "+r"(inr3),
- [out] "+r"(out0)
- : [w0] "w"(w0),
- [w1] "w"(w1),
- [w2] "w"(w2),
- [w3] "w"(w3),
- [w4] "w"(w4),
- [w5] "w"(w5),
- [w6] "w"(w6),
- [w7] "w"(w7),
- [w8] "w"(w8),
- [vbias] "w"(vbias),
- [outl] "r"(outl_ptr),
- [flag_mask] "r"(flag_mask)
- : "cc",
- "memory",
- "v0",
- "v1",
- "v2",
- "v3",
- "v4",
- "v5",
- "v6",
- "v7",
- "v8",
- "v9",
- "v10",
- "v11",
- "v15",
- "v16",
- "v17",
- "v18",
- "v19",
- "v20",
- "v21",
- "v22",
- "x0",
- "x1",
- "x2",
- "x3",
- "x4",
- "x5",
- "x6",
- "x7");
-#else
-#if 1 // def LITE_WITH_ARM_CLANG
-#else
- asm volatile(COMPUTE RELU STORE
- : [r0] "+r"(inr0),
- [r1] "+r"(inr1),
- [r2] "+r"(inr2),
- [r3] "+r"(inr3),
- [out0] "+r"(out0),
- [wc0] "+r"(weight_c)
- : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
- : "cc",
- "memory",
- "q0",
- "q1",
- "q2",
- "q3",
- "q4",
- "q5",
- "q6",
- "q7",
- "q8",
- "q9",
- "q10",
- "q11",
- "q12",
- "q13",
- "q14",
- "q15",
- "r0",
- "r1",
- "r2",
- "r3",
- "r4",
- "r5");
-#endif
-#endif
+ conv_3x3s1_depthwise_fp32_relu(i_data,
+ o_data,
+ bs,
+ oc,
+ oh,
+ ow,
+ ic,
+ ih,
+ win,
+ weights,
+ bias,
+ relu_ptr,
+ six_ptr,
+ scale_ptr,
+ param,
+ ctx);
break;
case lite_api::ActivationType::kRelu6:
-#ifdef __aarch64__
- asm volatile(COMPUTE RELU RELU6 STORE
- : [inr0] "+r"(inr0),
- [inr1] "+r"(inr1),
- [inr2] "+r"(inr2),
- [inr3] "+r"(inr3),
- [out] "+r"(out0)
- : [w0] "w"(w0),
- [w1] "w"(w1),
- [w2] "w"(w2),
- [w3] "w"(w3),
- [w4] "w"(w4),
- [w5] "w"(w5),
- [w6] "w"(w6),
- [w7] "w"(w7),
- [w8] "w"(w8),
- [vbias] "w"(vbias),
- [outl] "r"(outl_ptr),
- [flag_mask] "r"(flag_mask)
- : "cc",
- "memory",
- "v0",
- "v1",
- "v2",
- "v3",
- "v4",
- "v5",
- "v6",
- "v7",
- "v8",
- "v9",
- "v10",
- "v11",
- "v15",
- "v16",
- "v17",
- "v18",
- "v19",
- "v20",
- "v21",
- "v22",
- "x0",
- "x1",
- "x2",
- "x3",
- "x4",
- "x5",
- "x6",
- "x7");
-#else
-#if 1 // def LITE_WITH_ARM_CLANG
-#else
- asm volatile(COMPUTE RELU RELU6 STORE
- : [r0] "+r"(inr0),
- [r1] "+r"(inr1),
- [r2] "+r"(inr2),
- [r3] "+r"(inr3),
- [out0] "+r"(out0),
- [wc0] "+r"(weight_c)
- : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
- : "cc",
- "memory",
- "q0",
- "q1",
- "q2",
- "q3",
- "q4",
- "q5",
- "q6",
- "q7",
- "q8",
- "q9",
- "q10",
- "q11",
- "q12",
- "q13",
- "q14",
- "q15",
- "r0",
- "r1",
- "r2",
- "r3",
- "r4",
- "r5");
-#endif
-#endif
+ six_ptr[0] = act_param.Relu_clipped_coef;
+ six_ptr[1] = act_param.Relu_clipped_coef;
+ six_ptr[2] = act_param.Relu_clipped_coef;
+ six_ptr[3] = act_param.Relu_clipped_coef;
+ conv_3x3s1_depthwise_fp32_relu6(i_data,
+ o_data,
+ bs,
+ oc,
+ oh,
+ ow,
+ ic,
+ ih,
+ win,
+ weights,
+ bias,
+ relu_ptr,
+ six_ptr,
+ scale_ptr,
+ param,
+ ctx);
break;
case lite_api::ActivationType::kLeakyRelu:
-#ifdef __aarch64__
- asm volatile(COMPUTE LEAKY_RELU STORE
- : [inr0] "+r"(inr0),
- [inr1] "+r"(inr1),
- [inr2] "+r"(inr2),
- [inr3] "+r"(inr3),
- [out] "+r"(out0)
- : [w0] "w"(w0),
- [w1] "w"(w1),
- [w2] "w"(w2),
- [w3] "w"(w3),
- [w4] "w"(w4),
- [w5] "w"(w5),
- [w6] "w"(w6),
- [w7] "w"(w7),
- [w8] "w"(w8),
- [vbias] "w"(vbias),
- [outl] "r"(outl_ptr),
- [flag_mask] "r"(flag_mask)
- : "cc",
- "memory",
- "v0",
- "v1",
- "v2",
- "v3",
- "v4",
- "v5",
- "v6",
- "v7",
- "v8",
- "v9",
- "v10",
- "v11",
- "v15",
- "v16",
- "v17",
- "v18",
- "v19",
- "v20",
- "v21",
- "v22",
- "x0",
- "x1",
- "x2",
- "x3",
- "x4",
- "x5",
- "x6",
- "x7");
-#else
-#if 1 // def LITE_WITH_ARM_CLANG
-#else
- asm volatile(COMPUTE LEAKY_RELU STORE
- : [r0] "+r"(inr0),
- [r1] "+r"(inr1),
- [r2] "+r"(inr2),
- [r3] "+r"(inr3),
- [out0] "+r"(out0),
- [wc0] "+r"(weight_c)
- : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
- : "cc",
- "memory",
- "q0",
- "q1",
- "q2",
- "q3",
- "q4",
- "q5",
- "q6",
- "q7",
- "q8",
- "q9",
- "q10",
- "q11",
- "q12",
- "q13",
- "q14",
- "q15",
- "r0",
- "r1",
- "r2",
- "r3",
- "r4",
- "r5");
-#endif
-#endif
+ scale_ptr[0] = act_param.Leaky_relu_alpha;
+ scale_ptr[1] = act_param.Leaky_relu_alpha;
+ scale_ptr[2] = act_param.Leaky_relu_alpha;
+ scale_ptr[3] = act_param.Leaky_relu_alpha;
+ conv_3x3s1_depthwise_fp32_leakyRelu(i_data,
+ o_data,
+ bs,
+ oc,
+ oh,
+ ow,
+ ic,
+ ih,
+ win,
+ weights,
+ bias,
+ relu_ptr,
+ six_ptr,
+ scale_ptr,
+ param,
+ ctx);
break;
default:
LOG(FATAL) << "this act_type: "
@@ -728,108 +589,289 @@ void act_switch_3x3s1(const float* inr0,
<< " fuse not support";
}
} else {
-#ifdef __aarch64__
- asm volatile(COMPUTE STORE
- : [inr0] "+r"(inr0),
- [inr1] "+r"(inr1),
- [inr2] "+r"(inr2),
- [inr3] "+r"(inr3),
- [out] "+r"(out0)
- : [w0] "w"(w0),
- [w1] "w"(w1),
- [w2] "w"(w2),
- [w3] "w"(w3),
- [w4] "w"(w4),
- [w5] "w"(w5),
- [w6] "w"(w6),
- [w7] "w"(w7),
- [w8] "w"(w8),
- [vbias] "w"(vbias),
- [outl] "r"(outl_ptr),
- [flag_mask] "r"(flag_mask)
- : "cc",
- "memory",
- "v0",
- "v1",
- "v2",
- "v3",
- "v4",
- "v5",
- "v6",
- "v7",
- "v8",
- "v9",
- "v10",
- "v11",
- "v15",
- "v16",
- "v17",
- "v18",
- "v19",
- "v20",
- "v21",
- "v22",
- "x0",
- "x1",
- "x2",
- "x3",
- "x4",
- "x5",
- "x6",
- "x7");
-#else
-#if 1 // def LITE_WITH_ARM_CLANG
+ conv_3x3s1_depthwise_fp32_bias(i_data,
+ o_data,
+ bs,
+ oc,
+ oh,
+ ow,
+ ic,
+ ih,
+ win,
+ weights,
+ bias,
+ relu_ptr,
+ six_ptr,
+ scale_ptr,
+ param,
+ ctx);
+ }
+}
+
+void conv_3x3s1_depthwise_fp32_bias(const float* i_data,
+ float* o_data,
+ int bs,
+ int oc,
+ int oh,
+ int ow,
+ int ic,
+ int ih,
+ int win,
+ const float* weights,
+ const float* bias,
+ float* relu_ptr,
+ float* six_ptr,
+ float* scale_ptr,
+ const operators::ConvParam& param,
+ ARMContext* ctx) {
+ int threads = ctx->threads();
+
+ auto paddings = *param.paddings;
+ const int pad_h = paddings[0];
+ const int pad_w = paddings[2];
+
+ const int out_c_block = 4;
+ const int out_h_kernel = 2;
+ const int out_w_kernel = 4;
+ const int win_ext = ow + 2;
+ const int ow_round = ROUNDUP(ow, 4);
+ const int win_round = ROUNDUP(win_ext, 4);
+ const int hin_round = oh + 2;
+ const int prein_size = win_round * hin_round * out_c_block;
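+ // workspace layout: a zero row (win_round) used as the padding source, a
+ // scratch output row (ow_round) for out-of-range rows/channels, and one
+ // packed input block (prein_size) per thread.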
+ auto workspace_size =
+ threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
+ ctx->ExtendWorkspace(sizeof(float) * workspace_size);
+
+ bool flag_bias = param.bias != nullptr;
+
+ /// get workspace
+ float* ptr_zero = ctx->workspace_data<float>();
+ memset(ptr_zero, 0, sizeof(float) * win_round);
+ float* ptr_write = ptr_zero + win_round;
+
+ int size_in_channel = win * ih;
+ int size_out_channel = ow * oh;
+
+ int ws = -pad_w;
+ int we = ws + win_round;
+ int hs = -pad_h;
+ int he = hs + hin_round;
+ int w_loop = ow_round / 4;
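+ // the last 4-wide output block may overrun ow; flag_remain marks it and
+ // remain ends up holding the number of valid columns in that block.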
+ auto remain = w_loop * 4 - ow;
+ bool flag_remain = remain > 0;
+ remain = 4 - remain;
+ remain = remain > 0 ? remain : 0;
+ int row_len = win_round * out_c_block;
+
+ for (int n = 0; n < bs; ++n) {
+ const float* din_batch = i_data + n * ic * size_in_channel;
+ float* dout_batch = o_data + n * oc * size_out_channel;
+#pragma omp parallel for num_threads(threads)
+ for (int c = 0; c < oc; c += out_c_block) {
+#ifdef ARM_WITH_OMP
+ float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
#else
- asm volatile(COMPUTE STORE
- : [r0] "+r"(inr0),
- [r1] "+r"(inr1),
- [r2] "+r"(inr2),
- [r3] "+r"(inr3),
- [out0] "+r"(out0),
- [wc0] "+r"(weight_c)
- : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
- : "cc",
- "memory",
- "q0",
- "q1",
- "q2",
- "q3",
- "q4",
- "q5",
- "q6",
- "q7",
- "q8",
- "q9",
- "q10",
- "q11",
- "q12",
- "q13",
- "q14",
- "q15",
- "r0",
- "r1",
- "r2",
- "r3",
- "r4",
- "r5");
+ float* pre_din = ptr_write + ow_round;
#endif
+ /// const array size
+ float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT
+ prepack_input_nxwc4_dw(
+ din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
+ const float* weight_c = weights + c * 9; // kernel_w * kernel_h
+ float* dout_c00 = dout_batch + c * size_out_channel;
+ float bias_local[4] = {0, 0, 0, 0};
+ if (flag_bias) {
+ bias_local[0] = bias[c];
+ bias_local[1] = bias[c + 1];
+ bias_local[2] = bias[c + 2];
+ bias_local[3] = bias[c + 3];
+ }
+ float32x4_t vbias = vld1q_f32(bias_local);
+#ifdef __aarch64__
+ float32x4_t w0 = vld1q_f32(weight_c); // w0, v23
+ float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24
+ float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25
+ float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26
+ float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27
+ float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28
+ float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29
+ float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30
+ float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31
+#endif
+ for (int h = 0; h < oh; h += out_h_kernel) {
+ float* outc00 = dout_c00 + h * ow;
+ float* outc01 = outc00 + ow;
+ float* outc10 = outc00 + size_out_channel;
+ float* outc11 = outc10 + ow;
+ float* outc20 = outc10 + size_out_channel;
+ float* outc21 = outc20 + ow;
+ float* outc30 = outc20 + size_out_channel;
+ float* outc31 = outc30 + ow;
+ const float* inr0 = pre_din + h * row_len;
+ const float* inr1 = inr0 + row_len;
+ const float* inr2 = inr1 + row_len;
+ const float* inr3 = inr2 + row_len;
+ if (c + out_c_block > oc) {
+ switch (c + out_c_block - oc) {
+ case 3: // outc10-outc30 is ptr_write and extra
+ outc10 = ptr_write;
+ outc11 = ptr_write;
+ case 2: // outc20-outc30 is ptr_write and extra
+ outc20 = ptr_write;
+ outc21 = ptr_write;
+ case 1: // outc30 is ptr_write and extra
+ outc30 = ptr_write;
+ outc31 = ptr_write;
+ default:
+ break;
+ }
+ }
+ if (h + out_h_kernel > oh) {
+ outc01 = ptr_write;
+ outc11 = ptr_write;
+ outc21 = ptr_write;
+ outc31 = ptr_write;
+ }
+
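+ // 12-entry pointer table consumed by the assembly STORE macro: eight output
+ // row pointers (4 channels x 2 rows) followed by the bias, relu, relu6 and
+ // leaky-relu scale vectors.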
+ float* outl[] = {outc00,
+ outc10,
+ outc20,
+ outc30,
+ outc01,
+ outc11,
+ outc21,
+ outc31,
+ reinterpret_cast<float*>(bias_local),
+ reinterpret_cast<float*>(relu_ptr),
+ reinterpret_cast<float*>(six_ptr),
+ reinterpret_cast<float*>(scale_ptr)};
+ void* outl_ptr = reinterpret_cast<void*>(outl);
+ for (int w = 0; w < w_loop; ++w) {
+ bool flag_mask = (w == w_loop - 1) && flag_remain;
+ float* out0 = pre_out;
+#ifdef __aarch64__
+ asm volatile(COMPUTE STORE
+ : [inr0] "+r"(inr0),
+ [inr1] "+r"(inr1),
+ [inr2] "+r"(inr2),
+ [inr3] "+r"(inr3),
+ [out] "+r"(out0)
+ : [w0] "w"(w0),
+ [w1] "w"(w1),
+ [w2] "w"(w2),
+ [w3] "w"(w3),
+ [w4] "w"(w4),
+ [w5] "w"(w5),
+ [w6] "w"(w6),
+ [w7] "w"(w7),
+ [w8] "w"(w8),
+ [vbias] "w"(vbias),
+ [outl] "r"(outl_ptr),
+ [flag_mask] "r"(flag_mask)
+ : "cc",
+ "memory",
+ "v0",
+ "v1",
+ "v2",
+ "v3",
+ "v4",
+ "v5",
+ "v6",
+ "v7",
+ "v8",
+ "v9",
+ "v10",
+ "v11",
+ "v15",
+ "v16",
+ "v17",
+ "v18",
+ "v19",
+ "v20",
+ "v21",
+ "v22",
+ "x0",
+ "x1",
+ "x2",
+ "x3",
+ "x4",
+ "x5",
+ "x6",
+ "x7");
+#else
+ asm volatile(COMPUTE STORE
+ : [r0] "+r"(inr0),
+ [r1] "+r"(inr1),
+ [r2] "+r"(inr2),
+ [r3] "+r"(inr3),
+ [out0] "+r"(out0),
+ [wc0] "+r"(weight_c)
+ : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
+ : "cc",
+ "memory",
+ "q0",
+ "q1",
+ "q2",
+ "q3",
+ "q4",
+ "q5",
+ "q6",
+ "q7",
+ "q8",
+ "q9",
+ "q10",
+ "q11",
+ "q12",
+ "q13",
+ "q14",
+ "q15",
+ "r0",
+ "r1",
+ "r2",
+ "r3",
+ "r4");
#endif
+ outl[0] += 4;
+ outl[1] += 4;
+ outl[2] += 4;
+ outl[3] += 4;
+ outl[4] += 4;
+ outl[5] += 4;
+ outl[6] += 4;
+ outl[7] += 4;
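+ // for the masked last block the assembly stored into pre_out instead of the
+ // real output rows; copy back only the `remain` valid columns.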
+ if (flag_mask) {
+ memcpy(outl[0] - 4, pre_out, remain * sizeof(float));
+ memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float));
+ memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float));
+ memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float));
+ memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float));
+ memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float));
+ memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float));
+ memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float));
+ }
+ }
+ }
+ }
}
}
-void conv_3x3s1_depthwise_fp32(const float* i_data,
- float* o_data,
- int bs,
- int oc,
- int oh,
- int ow,
- int ic,
- int ih,
- int win,
- const float* weights,
- const float* bias,
- const operators::ConvParam& param,
- const operators::ActivationParam act_param,
- ARMContext* ctx) {
+
+void conv_3x3s1_depthwise_fp32_relu(const float* i_data,
+ float* o_data,
+ int bs,
+ int oc,
+ int oh,
+ int ow,
+ int ic,
+ int ih,
+ int win,
+ const float* weights,
+ const float* bias,
+ float* relu_ptr,
+ float* six_ptr,
+ float* scale_ptr,
+ const operators::ConvParam& param,
+ ARMContext* ctx) {
int threads = ctx->threads();
auto paddings = *param.paddings;
@@ -869,31 +911,6 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
remain = remain > 0 ? remain : 0;
int row_len = win_round * out_c_block;
- float six_ptr[4] = {0.f, 0.f, 0.f, 0.f};
- float scale_ptr[4] = {1.f, 1.f, 1.f, 1.f};
- float relu_ptr[4] = {0.f, 0.f, 0.f, 0.f};
- if (act_param.has_active) {
- switch (act_param.active_type) {
- case lite_api::ActivationType::kRelu:
- break;
- case lite_api::ActivationType::kRelu6:
- six_ptr[0] = act_param.Relu_clipped_coef;
- six_ptr[1] = act_param.Relu_clipped_coef;
- six_ptr[2] = act_param.Relu_clipped_coef;
- six_ptr[3] = act_param.Relu_clipped_coef;
- break;
- case lite_api::ActivationType::kLeakyRelu:
- scale_ptr[0] = act_param.Leaky_relu_alpha;
- scale_ptr[1] = act_param.Leaky_relu_alpha;
- scale_ptr[2] = act_param.Leaky_relu_alpha;
- scale_ptr[3] = act_param.Leaky_relu_alpha;
- break;
- default:
- LOG(FATAL) << "this act_type: "
- << static_cast<int>(act_param.active_type)
- << " fuse not support";
- }
- }
for (int n = 0; n < bs; ++n) {
const float* din_batch = i_data + n * ic * size_in_channel;
float* dout_batch = o_data + n * oc * size_out_channel;
@@ -944,13 +961,13 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
const float* inr3 = inr2 + row_len;
if (c + out_c_block > oc) {
switch (c + out_c_block - oc) {
- case 3:
+ case 3: // outc10-outc30 is ptr_write and extra
outc10 = ptr_write;
outc11 = ptr_write;
- case 2:
+ case 2: // outc20-outc30 is ptr_write and extra
outc20 = ptr_write;
outc21 = ptr_write;
- case 1:
+ case 1: // outc30 is ptr_write and extra
outc30 = ptr_write;
outc31 = ptr_write;
default:
@@ -981,48 +998,86 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
bool flag_mask = (w == w_loop - 1) && flag_remain;
float* out0 = pre_out;
#ifdef __aarch64__
- act_switch_3x3s1(inr0,
- inr1,
- inr2,
- inr3,
- out0,
- weight_c,
- flag_mask,
- outl_ptr,
- w0,
- w1,
- w2,
- w3,
- w4,
- w5,
- w6,
- w7,
- w8,
- vbias,
- act_param);
+ asm volatile(COMPUTE RELU STORE
+ : [inr0] "+r"(inr0),
+ [inr1] "+r"(inr1),
+ [inr2] "+r"(inr2),
+ [inr3] "+r"(inr3),
+ [out] "+r"(out0)
+ : [w0] "w"(w0),
+ [w1] "w"(w1),
+ [w2] "w"(w2),
+ [w3] "w"(w3),
+ [w4] "w"(w4),
+ [w5] "w"(w5),
+ [w6] "w"(w6),
+ [w7] "w"(w7),
+ [w8] "w"(w8),
+ [vbias] "w"(vbias),
+ [outl] "r"(outl_ptr),
+ [flag_mask] "r"(flag_mask)
+ : "cc",
+ "memory",
+ "v0",
+ "v1",
+ "v2",
+ "v3",
+ "v4",
+ "v5",
+ "v6",
+ "v7",
+ "v8",
+ "v9",
+ "v10",
+ "v11",
+ "v15",
+ "v16",
+ "v17",
+ "v18",
+ "v19",
+ "v20",
+ "v21",
+ "v22",
+ "x0",
+ "x1",
+ "x2",
+ "x3",
+ "x4",
+ "x5",
+ "x6",
+ "x7");
#else
-#if 1 // def LITE_WITH_ARM_CLANG
-#else
- act_switch_3x3s1(inr0,
- inr1,
- inr2,
- inr3,
- out0,
- weight_c,
- flag_mask,
- outl_ptr,
- vbias,
- vbias,
- vbias,
- vbias,
- vbias,
- vbias,
- vbias,
- vbias,
- vbias,
- vbias,
- act_param);
-#endif
+ asm volatile(COMPUTE RELU STORE
+ : [r0] "+r"(inr0),
+ [r1] "+r"(inr1),
+ [r2] "+r"(inr2),
+ [r3] "+r"(inr3),
+ [out0] "+r"(out0),
+ [wc0] "+r"(weight_c)
+ : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
+ : "cc",
+ "memory",
+ "q0",
+ "q1",
+ "q2",
+ "q3",
+ "q4",
+ "q5",
+ "q6",
+ "q7",
+ "q8",
+ "q9",
+ "q10",
+ "q11",
+ "q12",
+ "q13",
+ "q14",
+ "q15",
+ "r0",
+ "r1",
+ "r2",
+ "r3",
+ "r4");
#endif
outl[0] += 4;
outl[1] += 4;
@@ -1032,10 +1087,6 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
outl[5] += 4;
outl[6] += 4;
outl[7] += 4;
- inr0 += 16;
- inr1 += 16;
- inr2 += 16;
- inr3 += 16;
if (flag_mask) {
memcpy(outl[0] - 4, pre_out, remain * sizeof(float));
memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float));
@@ -1052,6 +1103,499 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
}
}
+void conv_3x3s1_depthwise_fp32_relu6(const float* i_data,
+ float* o_data,
+ int bs,
+ int oc,
+ int oh,
+ int ow,
+ int ic,
+ int ih,
+ int win,
+ const float* weights,
+ const float* bias,
+ float* relu_ptr,
+ float* six_ptr,
+ float* scale_ptr,
+ const operators::ConvParam& param,
+ ARMContext* ctx) {
+ int threads = ctx->threads();
+
+ auto paddings = *param.paddings;
+ const int pad_h = paddings[0];
+ const int pad_w = paddings[2];
+
+ const int out_c_block = 4;
+ const int out_h_kernel = 2;
+ const int out_w_kernel = 4;
+ const int win_ext = ow + 2;
+ const int ow_round = ROUNDUP(ow, 4);
+ const int win_round = ROUNDUP(win_ext, 4);
+ const int hin_round = oh + 2;
+ const int prein_size = win_round * hin_round * out_c_block;
+ auto workspace_size =
+ threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
+ ctx->ExtendWorkspace(sizeof(float) * workspace_size);
+
+ bool flag_bias = param.bias != nullptr;
+
+ /// get workspace
+ float* ptr_zero = ctx->workspace_data<float>();
+ memset(ptr_zero, 0, sizeof(float) * win_round);
+ float* ptr_write = ptr_zero + win_round;
+
+ int size_in_channel = win * ih;
+ int size_out_channel = ow * oh;
+
+ int ws = -pad_w;
+ int we = ws + win_round;
+ int hs = -pad_h;
+ int he = hs + hin_round;
+ int w_loop = ow_round / 4;
+ auto remain = w_loop * 4 - ow;
+ bool flag_remain = remain > 0;
+ remain = 4 - remain;
+ remain = remain > 0 ? remain : 0;
+ int row_len = win_round * out_c_block;
+
+ for (int n = 0; n < bs; ++n) {
+ const float* din_batch = i_data + n * ic * size_in_channel;
+ float* dout_batch = o_data + n * oc * size_out_channel;
+#pragma omp parallel for num_threads(threads)
+ for (int c = 0; c < oc; c += out_c_block) {
+#ifdef ARM_WITH_OMP
+ float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
+#else
+ float* pre_din = ptr_write + ow_round;
+#endif
+ /// const array size
+ float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT
+ prepack_input_nxwc4_dw(
+ din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
+ const float* weight_c = weights + c * 9; // kernel_w * kernel_h
+ float* dout_c00 = dout_batch + c * size_out_channel;
+ float bias_local[4] = {0, 0, 0, 0};
+ if (flag_bias) {
+ bias_local[0] = bias[c];
+ bias_local[1] = bias[c + 1];
+ bias_local[2] = bias[c + 2];
+ bias_local[3] = bias[c + 3];
+ }
+ float32x4_t vbias = vld1q_f32(bias_local);
+#ifdef __aarch64__
+ float32x4_t w0 = vld1q_f32(weight_c); // w0, v23
+ float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24
+ float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25
+ float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26
+ float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27
+ float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28
+ float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29
+ float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30
+ float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31
+#endif
+ for (int h = 0; h < oh; h += out_h_kernel) {
+ float* outc00 = dout_c00 + h * ow;
+ float* outc01 = outc00 + ow;
+ float* outc10 = outc00 + size_out_channel;
+ float* outc11 = outc10 + ow;
+ float* outc20 = outc10 + size_out_channel;
+ float* outc21 = outc20 + ow;
+ float* outc30 = outc20 + size_out_channel;
+ float* outc31 = outc30 + ow;
+ const float* inr0 = pre_din + h * row_len;
+ const float* inr1 = inr0 + row_len;
+ const float* inr2 = inr1 + row_len;
+ const float* inr3 = inr2 + row_len;
+ if (c + out_c_block > oc) {
+ switch (c + out_c_block - oc) {
+ case 3: // outc10-outc30 is ptr_write and extra
+ outc10 = ptr_write;
+ outc11 = ptr_write;
+ case 2: // outc20-outc30 is ptr_write and extra
+ outc20 = ptr_write;
+ outc21 = ptr_write;
+ case 1: // outc30 is ptr_write and extra
+ outc30 = ptr_write;
+ outc31 = ptr_write;
+ default:
+ break;
+ }
+ }
+ if (h + out_h_kernel > oh) {
+ outc01 = ptr_write;
+ outc11 = ptr_write;
+ outc21 = ptr_write;
+ outc31 = ptr_write;
+ }
+
+ float* outl[] = {outc00,
+ outc10,
+ outc20,
+ outc30,
+ outc01,
+ outc11,
+ outc21,
+ outc31,
+ reinterpret_cast<float*>(bias_local),
+ reinterpret_cast<float*>(relu_ptr),
+ reinterpret_cast<float*>(six_ptr),
+ reinterpret_cast<float*>(scale_ptr)};
+ void* outl_ptr = reinterpret_cast<void*>(outl);
+ for (int w = 0; w < w_loop; ++w) {
+ bool flag_mask = (w == w_loop - 1) && flag_remain;
+ float* out0 = pre_out;
+#ifdef __aarch64__
+ asm volatile(COMPUTE RELU RELU6 STORE
+ : [inr0] "+r"(inr0),
+ [inr1] "+r"(inr1),
+ [inr2] "+r"(inr2),
+ [inr3] "+r"(inr3),
+ [out] "+r"(out0)
+ : [w0] "w"(w0),
+ [w1] "w"(w1),
+ [w2] "w"(w2),
+ [w3] "w"(w3),
+ [w4] "w"(w4),
+ [w5] "w"(w5),
+ [w6] "w"(w6),
+ [w7] "w"(w7),
+ [w8] "w"(w8),
+ [vbias] "w"(vbias),
+ [outl] "r"(outl_ptr),
+ [flag_mask] "r"(flag_mask)
+ : "cc",
+ "memory",
+ "v0",
+ "v1",
+ "v2",
+ "v3",
+ "v4",
+ "v5",
+ "v6",
+ "v7",
+ "v8",
+ "v9",
+ "v10",
+ "v11",
+ "v15",
+ "v16",
+ "v17",
+ "v18",
+ "v19",
+ "v20",
+ "v21",
+ "v22",
+ "x0",
+ "x1",
+ "x2",
+ "x3",
+ "x4",
+ "x5",
+ "x6",
+ "x7");
+#else
+ asm volatile(COMPUTE RELU RELU6 STORE
+ : [r0] "+r"(inr0),
+ [r1] "+r"(inr1),
+ [r2] "+r"(inr2),
+ [r3] "+r"(inr3),
+ [out0] "+r"(out0),
+ [wc0] "+r"(weight_c)
+ : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
+ : "cc",
+ "memory",
+ "q0",
+ "q1",
+ "q2",
+ "q3",
+ "q4",
+ "q5",
+ "q6",
+ "q7",
+ "q8",
+ "q9",
+ "q10",
+ "q11",
+ "q12",
+ "q13",
+ "q14",
+ "q15",
+ "r0",
+ "r1",
+ "r2",
+ "r3",
+ "r4");
+#endif
+ outl[0] += 4;
+ outl[1] += 4;
+ outl[2] += 4;
+ outl[3] += 4;
+ outl[4] += 4;
+ outl[5] += 4;
+ outl[6] += 4;
+ outl[7] += 4;
+ if (flag_mask) {
+ memcpy(outl[0] - 4, pre_out, remain * sizeof(float));
+ memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float));
+ memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float));
+ memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float));
+ memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float));
+ memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float));
+ memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float));
+ memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float));
+ }
+ }
+ }
+ }
+ }
+}
+
+void conv_3x3s1_depthwise_fp32_leakyRelu(const float* i_data,
+ float* o_data,
+ int bs,
+ int oc,
+ int oh,
+ int ow,
+ int ic,
+ int ih,
+ int win,
+ const float* weights,
+ const float* bias,
+ float* relu_ptr,
+ float* six_ptr,
+ float* scale_ptr,
+ const operators::ConvParam& param,
+ ARMContext* ctx) {
+ int threads = ctx->threads();
+
+ auto paddings = *param.paddings;
+ const int pad_h = paddings[0];
+ const int pad_w = paddings[2];
+
+ const int out_c_block = 4;
+ const int out_h_kernel = 2;
+ const int out_w_kernel = 4;
+ const int win_ext = ow + 2;
+ const int ow_round = ROUNDUP(ow, 4);
+ const int win_round = ROUNDUP(win_ext, 4);
+ const int hin_round = oh + 2;
+ const int prein_size = win_round * hin_round * out_c_block;
+ auto workspace_size =
+ threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
+ ctx->ExtendWorkspace(sizeof(float) * workspace_size);
+
+ bool flag_bias = param.bias != nullptr;
+
+ /// get workspace
+ float* ptr_zero = ctx->workspace_data<float>();
+ memset(ptr_zero, 0, sizeof(float) * win_round);
+ float* ptr_write = ptr_zero + win_round;
+
+ int size_in_channel = win * ih;
+ int size_out_channel = ow * oh;
+
+ int ws = -pad_w;
+ int we = ws + win_round;
+ int hs = -pad_h;
+ int he = hs + hin_round;
+ int w_loop = ow_round / 4;
+ auto remain = w_loop * 4 - ow;
+ bool flag_remain = remain > 0;
+ remain = 4 - remain;
+ remain = remain > 0 ? remain : 0;
+ int row_len = win_round * out_c_block;
+
+ for (int n = 0; n < bs; ++n) {
+ const float* din_batch = i_data + n * ic * size_in_channel;
+ float* dout_batch = o_data + n * oc * size_out_channel;
+#pragma omp parallel for num_threads(threads)
+ for (int c = 0; c < oc; c += out_c_block) {
+#ifdef ARM_WITH_OMP
+ float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size;
+#else
+ float* pre_din = ptr_write + ow_round;
+#endif
+ /// const array size
+ float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT
+ prepack_input_nxwc4_dw(
+ din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero);
+ const float* weight_c = weights + c * 9; // kernel_w * kernel_h
+ float* dout_c00 = dout_batch + c * size_out_channel;
+ float bias_local[4] = {0, 0, 0, 0};
+ if (flag_bias) {
+ bias_local[0] = bias[c];
+ bias_local[1] = bias[c + 1];
+ bias_local[2] = bias[c + 2];
+ bias_local[3] = bias[c + 3];
+ }
+ float32x4_t vbias = vld1q_f32(bias_local);
+#ifdef __aarch64__
+ float32x4_t w0 = vld1q_f32(weight_c); // w0, v23
+ float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24
+ float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25
+ float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26
+ float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27
+ float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28
+ float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29
+ float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30
+ float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31
+#endif
+ for (int h = 0; h < oh; h += out_h_kernel) {
+ float* outc00 = dout_c00 + h * ow;
+ float* outc01 = outc00 + ow;
+ float* outc10 = outc00 + size_out_channel;
+ float* outc11 = outc10 + ow;
+ float* outc20 = outc10 + size_out_channel;
+ float* outc21 = outc20 + ow;
+ float* outc30 = outc20 + size_out_channel;
+ float* outc31 = outc30 + ow;
+ const float* inr0 = pre_din + h * row_len;
+ const float* inr1 = inr0 + row_len;
+ const float* inr2 = inr1 + row_len;
+ const float* inr3 = inr2 + row_len;
+ if (c + out_c_block > oc) {
+ switch (c + out_c_block - oc) {
+ case 3: // outc10-outc30 is ptr_write and extra
+ outc10 = ptr_write;
+ outc11 = ptr_write;
+ case 2: // outc20-outc30 is ptr_write and extra
+ outc20 = ptr_write;
+ outc21 = ptr_write;
+ case 1: // outc30 is ptr_write and extra
+ outc30 = ptr_write;
+ outc31 = ptr_write;
+ default:
+ break;
+ }
+ }
+ if (h + out_h_kernel > oh) {
+ outc01 = ptr_write;
+ outc11 = ptr_write;
+ outc21 = ptr_write;
+ outc31 = ptr_write;
+ }
+
+ float* outl[] = {outc00,
+ outc10,
+ outc20,
+ outc30,
+ outc01,
+ outc11,
+ outc21,
+ outc31,
+ reinterpret_cast<float*>(bias_local),
+ reinterpret_cast<float*>(relu_ptr),
+ reinterpret_cast<float*>(six_ptr),
+ reinterpret_cast<float*>(scale_ptr)};
+ void* outl_ptr = reinterpret_cast<void*>(outl);
+ for (int w = 0; w < w_loop; ++w) {
+ bool flag_mask = (w == w_loop - 1) && flag_remain;
+ float* out0 = pre_out;
+#ifdef __aarch64__
+ asm volatile(COMPUTE LEAKY_RELU STORE
+ : [inr0] "+r"(inr0),
+ [inr1] "+r"(inr1),
+ [inr2] "+r"(inr2),
+ [inr3] "+r"(inr3),
+ [out] "+r"(out0)
+ : [w0] "w"(w0),
+ [w1] "w"(w1),
+ [w2] "w"(w2),
+ [w3] "w"(w3),
+ [w4] "w"(w4),
+ [w5] "w"(w5),
+ [w6] "w"(w6),
+ [w7] "w"(w7),
+ [w8] "w"(w8),
+ [vbias] "w"(vbias),
+ [outl] "r"(outl_ptr),
+ [flag_mask] "r"(flag_mask)
+ : "cc",
+ "memory",
+ "v0",
+ "v1",
+ "v2",
+ "v3",
+ "v4",
+ "v5",
+ "v6",
+ "v7",
+ "v8",
+ "v9",
+ "v10",
+ "v11",
+ "v15",
+ "v16",
+ "v17",
+ "v18",
+ "v19",
+ "v20",
+ "v21",
+ "v22",
+ "x0",
+ "x1",
+ "x2",
+ "x3",
+ "x4",
+ "x5",
+ "x6",
+ "x7");
+#else
+ asm volatile(COMPUTE LEAKY_RELU STORE
+ : [r0] "+r"(inr0),
+ [r1] "+r"(inr1),
+ [r2] "+r"(inr2),
+ [r3] "+r"(inr3),
+ [out0] "+r"(out0),
+ [wc0] "+r"(weight_c)
+ : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
+ : "cc",
+ "memory",
+ "q0",
+ "q1",
+ "q2",
+ "q3",
+ "q4",
+ "q5",
+ "q6",
+ "q7",
+ "q8",
+ "q9",
+ "q10",
+ "q11",
+ "q12",
+ "q13",
+ "q14",
+ "q15",
+ "r0",
+ "r1",
+ "r2",
+ "r3",
+ "r4");
+#endif
+ outl[0] += 4;
+ outl[1] += 4;
+ outl[2] += 4;
+ outl[3] += 4;
+ outl[4] += 4;
+ outl[5] += 4;
+ outl[6] += 4;
+ outl[7] += 4;
+ if (flag_mask) {
+ memcpy(outl[0] - 4, pre_out, remain * sizeof(float));
+ memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float));
+ memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float));
+ memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float));
+ memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float));
+ memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float));
+ memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float));
+ memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float));
+ }
+ }
+ }
+ }
+ }
+}
} // namespace math
} // namespace arm
} // namespace lite
diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc
index 2bad1f997f457429c013c11a1dce35eb43dc26da..fa2f85311b3ff4247d52505d750566ec80e47256 100644
--- a/lite/backends/arm/math/conv_impl.cc
+++ b/lite/backends/arm/math/conv_impl.cc
@@ -620,8 +620,10 @@ void conv_depthwise_3x3_fp32(const void* din,
int pad = pad_w;
bool flag_bias = param.bias != nullptr;
bool pads_less = ((paddings[1] < 2) && (paddings[3] < 2));
+ bool ch_four = ch_in <= 4 * w_in;
if (stride == 1) {
- if (pads_less && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1]
+ if (ch_four && pads_less && (pad_h == pad_w) &&
+ (pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s1_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
@@ -638,7 +640,6 @@ void conv_depthwise_3x3_fp32(const void* din,
act_param,
ctx);
} else {
-#ifdef __aarch64__
conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
@@ -653,30 +654,10 @@ void conv_depthwise_3x3_fp32(const void* din,
param,
act_param,
ctx);
-#else
-#ifdef LITE_WITH_ARM_CLANG
- LOG(FATAL) << "fp32 depthwise conv3x3s1px doesnot support in v7-clang, "
- "this can run in basic";
-#else
- conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
- reinterpret_cast<float*>(dout),
- num,
- ch_out,
- h_out,
- w_out,
- ch_in,
- h_in,
- w_in,
- reinterpret_cast<const float*>(weights),
- bias,
- param,
- act_param,
- ctx);
-#endif
-#endif
}
} else if (stride == 2) {
- if (pads_less && pad_h == pad_w && (pad < 2)) { // support pad = [0, 1]
+ if (ch_four && pads_less && pad_h == pad_w &&
+ (pad < 2)) { // support pad = [0, 1]
conv_depthwise_3x3s2_fp32(reinterpret_cast<const float*>(din),
reinterpret_cast<float*>(dout),
num,
diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h
index 2e52bd1e285b7493148a5a779bffcfcfd1336722..f1ac1d63a1b40e2ead5e976e0bffe6c435a2545b 100644
--- a/lite/backends/arm/math/funcs.h
+++ b/lite/backends/arm/math/funcs.h
@@ -53,7 +53,9 @@
#include "lite/backends/arm/math/reduce_max.h"
#include "lite/backends/arm/math/reduce_mean.h"
#include "lite/backends/arm/math/reduce_prod.h"
+#include "lite/backends/arm/math/reduce_sum.h"
#include "lite/backends/arm/math/scale.h"
+#include "lite/backends/arm/math/scatter.h"
#include "lite/backends/arm/math/sequence_expand.h"
#include "lite/backends/arm/math/sequence_pool.h"
#include "lite/backends/arm/math/sequence_pool_grad.h"
@@ -357,6 +359,15 @@ inline float32x4_t pow_ps(float32x4_t a, float32x4_t b) {
return exp_ps(vmulq_f32(b, log_ps(a)));
}
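+// pairwise horizontal add: returns {a0 + a1, a2 + a3, b0 + b1, b2 + b3}.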
+inline float32x4_t vpaddq_f32(float32x4_t a, float32x4_t b) {
+ float32x4_t vrst;
+ vrst[0] = a[0] + a[1];
+ vrst[1] = a[2] + a[3];
+ vrst[2] = b[0] + b[1];
+ vrst[3] = b[2] + b[3];
+ return vrst;
+}
+
template <typename T>
void fill_bias_fc(
T* tensor, const T* bias, int num, int channel, bool flag_relu);
diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc
index 1c53142fc53bc785efcbf28fa007d403ad99ab70..4345c2e8137dbe0d0d1031cb4b41a2163d49ed57 100644
--- a/lite/backends/arm/math/interpolate.cc
+++ b/lite/backends/arm/math/interpolate.cc
@@ -70,7 +70,8 @@ void bilinear_interp(const float* src,
int h_out,
float scale_x,
float scale_y,
- bool with_align) {
+ bool align_corners,
+ bool align_mode) {
int* buf = new int[w_out + h_out + w_out * 2 + h_out * 2];
int* xofs = buf;
@@ -78,14 +79,13 @@ void bilinear_interp(const float* src,
float* alpha = reinterpret_cast<float*>(buf + w_out + h_out);
float* beta = reinterpret_cast<float*>(buf + w_out + h_out + w_out * 2);
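+  // align_mode == 0 without align_corners selects the half-pixel
+  // (pixel-centre) source mapping handled in the else-branch below;
+  // otherwise the direct dx * scale mapping is used.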
+ bool with_align = (align_mode == 0 && !align_corners);
float fx = 0.0f;
float fy = 0.0f;
int sx = 0;
int sy = 0;
- if (with_align) {
- scale_x = static_cast<float>(w_in - 1) / (w_out - 1);
- scale_y = static_cast<float>(h_in - 1) / (h_out - 1);
+ if (!with_align) {
// calculate x axis coordinate
for (int dx = 0; dx < w_out; dx++) {
fx = dx * scale_x;
@@ -105,8 +105,6 @@ void bilinear_interp(const float* src,
beta[dy * 2 + 1] = fy;
}
} else {
- scale_x = static_cast<float>(w_in) / w_out;
- scale_y = static_cast<float>(h_in) / h_out;
// calculate x axis coordinate
for (int dx = 0; dx < w_out; dx++) {
fx = scale_x * (dx + 0.5f) - 0.5f;
@@ -468,15 +466,9 @@ void nearest_interp(const float* src,
float* dst,
int w_out,
int h_out,
- float scale_x,
- float scale_y,
+ float scale_w_new,
+ float scale_h_new,
bool with_align) {
- float scale_w_new = (with_align)
- ? (static_cast<float>(w_in - 1) / (w_out - 1))
- : (static_cast<float>(w_in) / (w_out));
- float scale_h_new = (with_align)
- ? (static_cast<float>(h_in - 1) / (h_out - 1))
- : (static_cast<float>(h_in) / (h_out));
if (with_align) {
for (int h = 0; h < h_out; ++h) {
float* dst_p = dst + h * w_out;
@@ -506,7 +498,8 @@ void interpolate(lite::Tensor* X,
int out_height,
int out_width,
float scale,
- bool with_align,
+ bool align_corners,
+ bool align_mode,
std::string interpolate_type) {
int in_h = X->dims()[2];
int in_w = X->dims()[3];
@@ -531,12 +524,12 @@ void interpolate(lite::Tensor* X,
out_width = out_size_data[1];
}
}
- float height_scale = scale;
- float width_scale = scale;
- if (out_width > 0 && out_height > 0) {
- height_scale = static_cast<float>(out_height / X->dims()[2]);
- width_scale = static_cast<float>(out_width / X->dims()[3]);
- }
+ // float height_scale = scale;
+ // float width_scale = scale;
+ // if (out_width > 0 && out_height > 0) {
+ // height_scale = static_cast<float>(out_height / X->dims()[2]);
+ // width_scale = static_cast<float>(out_width / X->dims()[3]);
+ // }
int num_cout = X->dims()[0];
int c_cout = X->dims()[1];
Out->Resize({num_cout, c_cout, out_height, out_width});
@@ -551,6 +544,10 @@ void interpolate(lite::Tensor* X,
int spatial_in = in_h * in_w;
int spatial_out = out_h * out_w;
+ float scale_x = (align_corners) ? (static_cast<float>(in_w - 1) / (out_w - 1))
+ : (static_cast<float>(in_w) / (out_w));
+ float scale_y = (align_corners) ? (static_cast<float>(in_h - 1) / (out_h - 1))
+ : (static_cast<float>(in_h) / (out_h));
if ("Bilinear" == interpolate_type) {
#pragma omp parallel for
for (int i = 0; i < count; ++i) {
@@ -560,9 +557,10 @@ void interpolate(lite::Tensor* X,
dout + spatial_out * i,
out_w,
out_h,
- 1.f / width_scale,
- 1.f / height_scale,
- with_align);
+ scale_x,
+ scale_y,
+ align_corners,
+ align_mode);
}
} else if ("Nearest" == interpolate_type) {
#pragma omp parallel for
@@ -573,9 +571,9 @@ void interpolate(lite::Tensor* X,
dout + spatial_out * i,
out_w,
out_h,
- 1.f / width_scale,
- 1.f / height_scale,
- with_align);
+ scale_x,
+ scale_y,
+ align_corners);
}
}
}
diff --git a/lite/backends/arm/math/interpolate.h b/lite/backends/arm/math/interpolate.h
index e9c41c5bc86c8f00d57e096e3cd2b5f37df3a474..82c4c068b69567c01d37cfa901f9b58626574865 100644
--- a/lite/backends/arm/math/interpolate.h
+++ b/lite/backends/arm/math/interpolate.h
@@ -30,7 +30,8 @@ void bilinear_interp(const float* src,
int h_out,
float scale_x,
float scale_y,
- bool with_align);
+ bool align_corners,
+ bool align_mode);
void nearest_interp(const float* src,
int w_in,
@@ -40,7 +41,7 @@ void nearest_interp(const float* src,
int h_out,
float scale_x,
float scale_y,
- bool with_align);
+ bool align_corners);
void interpolate(lite::Tensor* X,
lite::Tensor* OutSize,
@@ -50,7 +51,8 @@ void interpolate(lite::Tensor* X,
int out_height,
int out_width,
float scale,
- bool with_align,
+ bool align_corners,
+ bool align_mode,
std::string interpolate_type);
} /* namespace math */
diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc
old mode 100644
new mode 100755
diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc
index c3652217ededa10b57e211ba7f5d3dc76e235978..1817e934cc460fdff6f18ec7491838ff1a5ce640 100644
--- a/lite/backends/arm/math/pooling.cc
+++ b/lite/backends/arm/math/pooling.cc
@@ -2224,7 +2224,13 @@ void pooling3x3s2p1_max(const float* din,
w_unroll_size -= 1;
w_unroll_remian = wout - w_unroll_size * 4;
}
- float32x4_t vmin = vdupq_n_f32(std::numeric_limits<float>::lowest());
+ int w_needed = wout * 2 + 1;
+ int need_right = w_needed - win - pad_right;
+ int w_2 = need_right > 0 ? w_unroll_remian : w_unroll_remian + 1;
+ w_2 = w_unroll_size <= 0 ? w_2 - 1 : w_2;
+ need_right = wout > 1 ? need_right : 0;
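+ // the pair-max loop below produces w_2 - 1 outputs (two input columns each);
+ // need_right flags a final output column whose window reads past the input
+ // width and is computed afterwards from the last column(s).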
+ float minval = std::numeric_limits<float>::lowest();
+ float32x4_t vmin = vdupq_n_f32(minval);
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
@@ -2263,6 +2269,11 @@ void pooling3x3s2p1_max(const float* din,
break;
}
}
+
+ auto pr0 = dr0;
+ auto pr1 = dr1;
+ auto pr2 = dr2;
+
int cnt_num = w_unroll_size;
if (w_unroll_size > 0) {
#ifdef __aarch64__
@@ -2316,27 +2327,60 @@ void pooling3x3s2p1_max(const float* din,
"q11",
"q15");
#endif
+
dr0 -= 8;
dr1 -= 8;
dr2 -= 8;
- }
- // deal with right pad
- int wstart = w_unroll_size * 4 * S - P;
- for (int j = 0; j < w_unroll_remian; ++j) {
- int wend = std::min(wstart + K, win);
- int st = wstart > 0 ? wstart : 0;
- float tmp = dr0[0];
- for (int i = 0; i < wend - st; i++) {
+ } else {
+ float tmp = minval;
+ int left_ = std::min(2, win);
+ for (int i = 0; i < left_; i++) {
tmp = std::max(tmp, dr0[i]);
tmp = std::max(tmp, dr1[i]);
tmp = std::max(tmp, dr2[i]);
}
- *(dr_out++) = tmp;
- dr0 += S - (st - wstart);
- dr1 += S - (st - wstart);
- dr2 += S - (st - wstart);
- wstart += S;
+
+ dr_out[0] = tmp;
+ dr0++;
+ dr1++;
+ dr2++;
+ dr_out++;
}
+
+ for (int w = 0; w < w_2 - 1; w += 1) {
+ float32x4_t vr0 = vld1q_f32(dr0);
+ float32x4_t vr1 = vld1q_f32(dr1);
+ float32x4_t vr2 = vld1q_f32(dr2);
+ vr0 = vsetq_lane_f32(minval, vr0, 3);
+ vr1 = vsetq_lane_f32(minval, vr1, 3);
+ vr2 = vsetq_lane_f32(minval, vr2, 3);
+ float32x4_t vmax1 = vmaxq_f32(vr0, vr1);
+ vmax1 = vmaxq_f32(vmax1, vr2);
+ float32x2_t vmax2 =
+ vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1));
+ float32x2_t vmax = vpmax_f32(vmax2, vmax2);
+ dr_out[0] = vget_lane_f32(vmax, 0);
+ dr_out++;
+
+ dr0 += 2;
+ dr1 += 2;
+ dr2 += 2;
+ }
+
+ if (need_right) {
+ float tmp = minval;
+ int idx = win - 1;
+ tmp = std::max(tmp, std::max(pr0[idx], pr1[idx]));
+ tmp = std::max(tmp, pr2[idx]);
+ dr_out[0] = tmp;
+ if (win % 2) {
+ idx = win - 2;
+ tmp = std::max(tmp, std::max(pr0[idx], pr1[idx]));
+ tmp = std::max(tmp, pr2[idx]);
+ dr_out[0] = tmp;
+ }
+ }
+
data_out_channel += wout;
}
}
@@ -2573,6 +2617,7 @@ void pooling3x3s2p0_max(const float* din,
int wend = std::min(tmp_val + K, win) - tmp_val;
float minval = std::numeric_limits<float>::lowest();
remain = right > 0 ? remain : remain + 1;
+
for (int n = 0; n < num; ++n) {
float* data_out_batch = data_out + n * chout * size_channel_out;
const float* data_in_batch = data_in + n * chin * size_channel_in;
@@ -2663,13 +2708,14 @@ void pooling3x3s2p0_max(const float* din,
vpmax_f32(vget_low_f32(vmax1), vget_high_f32(vmax1));
float32x2_t vmax = vpmax_f32(vmax2, vmax2);
dr_out[0] = vget_lane_f32(vmax, 0);
+
dr_out++;
dr0 += 2;
dr1 += 2;
dr2 += 2;
}
- if (right) {
- float tmp = dr0[0]; // std::numeric_limits<float>::min();
+ if (right > 0) {
+ float tmp = dr0[0];
for (int i = 0; i < wend; i++) {
tmp = std::max(tmp, std::max(dr0[i], dr1[i]));
tmp = std::max(tmp, dr2[i]);
diff --git a/lite/backends/arm/math/reduce_sum.cc b/lite/backends/arm/math/reduce_sum.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b563887e8619e29e40d85699b6979713aae8c0a2
--- /dev/null
+++ b/lite/backends/arm/math/reduce_sum.cc
@@ -0,0 +1,385 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "lite/backends/arm/math/reduce_sum.h"
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+template <>
+void reduce_sum_n(const float* src,
+ float* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in) {
+ int chw_size = channel_in * height_in * width_in;
+ if (num_in == 1) {
+ memcpy(dst, src, sizeof(float) * chw_size);
+ } else {
+ int cnt_n = num_in >> 2;
+ int remain_n = num_in & 3;
+ int cnt_chw = chw_size >> 3;
+ int cnt_rem = chw_size & 7;
+ int stride = chw_size << 2;
+ int stride_c = 0;
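+ // walk the chw dimension eight floats at a time, accumulating four batches
+ // per inner iteration; sub-8 and sub-4 tails are handled below.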
+ for (int c = 0; c < cnt_chw; c++) {
+ float32x4_t vsum0 = vdupq_n_f32(0.f);
+ float32x4_t vsum1 = vdupq_n_f32(0.f);
+ const float* din_ptr0 = src + stride_c;
+ const float* din_ptr1 = din_ptr0 + chw_size;
+ const float* din_ptr2 = din_ptr1 + chw_size;
+ const float* din_ptr3 = din_ptr2 + chw_size;
+ for (int n = 0; n < cnt_n; n++) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ float32x4_t vb0 = vld1q_f32(din_ptr1);
+ float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
+ float32x4_t vb1 = vld1q_f32(din_ptr1 + 4);
+ float32x4_t vc0 = vld1q_f32(din_ptr2);
+ float32x4_t vd0 = vld1q_f32(din_ptr3);
+ float32x4_t vs00 = vaddq_f32(va0, vb0);
+ float32x4_t vc1 = vld1q_f32(din_ptr2 + 4);
+ float32x4_t vs10 = vaddq_f32(va1, vb1);
+ float32x4_t vd1 = vld1q_f32(din_ptr3 + 4);
+ float32x4_t vs01 = vaddq_f32(vc0, vd0);
+ vsum0 = vaddq_f32(vsum0, vs00);
+ float32x4_t vs11 = vaddq_f32(vc1, vd1);
+ vsum1 = vaddq_f32(vsum1, vs10);
+ din_ptr0 += stride;
+ din_ptr1 += stride;
+ vsum0 = vaddq_f32(vsum0, vs01);
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ vsum1 = vaddq_f32(vsum1, vs11);
+ }
+ for (int n = 0; n < remain_n; n++) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
+ vsum0 = vaddq_f32(vsum0, va0);
+ din_ptr0 += chw_size;
+ vsum1 = vaddq_f32(vsum1, va1);
+ }
+ vst1q_f32(dst, vsum0);
+ dst += 4;
+ stride_c += 8;
+ vst1q_f32(dst, vsum1);
+ dst += 4;
+ }
+ if (cnt_rem > 3) {
+ float32x4_t vsum0 = vdupq_n_f32(0.f);
+ const float* din_ptr0 = src + stride_c;
+ const float* din_ptr1 = din_ptr0 + chw_size;
+ const float* din_ptr2 = din_ptr1 + chw_size;
+ const float* din_ptr3 = din_ptr2 + chw_size;
+ for (int n = 0; n < cnt_n; n++) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ float32x4_t vb0 = vld1q_f32(din_ptr1);
+ float32x4_t vc0 = vld1q_f32(din_ptr2);
+ float32x4_t vd0 = vld1q_f32(din_ptr3);
+ float32x4_t vs00 = vaddq_f32(va0, vb0);
+ float32x4_t vs01 = vaddq_f32(vc0, vd0);
+ vsum0 = vaddq_f32(vsum0, vs00);
+ din_ptr0 += stride;
+ din_ptr1 += stride;
+ vsum0 = vaddq_f32(vsum0, vs01);
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ }
+ for (int n = 0; n < remain_n; n++) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ din_ptr0 += chw_size;
+ vsum0 = vaddq_f32(vsum0, va0);
+ }
+ stride_c += 4;
+ vst1q_f32(dst, vsum0);
+ dst += 4;
+ cnt_rem -= 4;
+ }
+ for (int c = 0; c < cnt_rem; c++) {
+ const float* din_ptr0 = src + stride_c;
+ const float* din_ptr1 = din_ptr0 + chw_size;
+ const float* din_ptr2 = din_ptr1 + chw_size;
+ const float* din_ptr3 = din_ptr2 + chw_size;
+ float sum = 0.0;
+ for (int n = 0; n < cnt_n; n++) {
+ float tmp0 = din_ptr0[0] + din_ptr1[0];
+ float tmp1 = din_ptr2[0] + din_ptr3[0];
+ din_ptr0 += stride;
+ din_ptr1 += stride;
+ sum += tmp0;
+ din_ptr2 += stride;
+ din_ptr3 += stride;
+ sum += tmp1;
+ }
+ for (int n = 0; n < remain_n; n++) {
+ sum += din_ptr0[0];
+ din_ptr0 += chw_size;
+ }
+ stride_c++;
+ dst[0] = sum;
+ dst++;
+ }
+ }
+}
+
+template <>
+void reduce_sum_c(const float* src,
+ float* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in) {
+ int hw_size = height_in * width_in;
+ int chw_size = hw_size * channel_in;
+ for (int n = 0; n < num_in; ++n) {
+ reduce_sum_n(src, dst, channel_in, 1, height_in, width_in);
+ src += chw_size;
+ dst += hw_size;
+ }
+}
+
+template <>
+void reduce_sum_h(const float* src,
+ float* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in) {
+ int nc_size = num_in * channel_in;
+ int hw_size = height_in * width_in;
+ for (int n = 0; n < nc_size; ++n) {
+ reduce_sum_n(src, dst, height_in, 1, 1, width_in);
+ src += hw_size;
+ dst += width_in;
+ }
+}
+
+template <>
+void reduce_sum_w(const float* src,
+ float* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in) {
+ int nch_size = num_in * channel_in * height_in;
+ int cnt_w = width_in >> 3;
+ int cnt_n = nch_size >> 2;
+ int rem_w = width_in & 7;
+ int rem_n = nch_size & 3;
+ int stride = 0;
+ int stride_n = width_in << 2;
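+ // reduce along the width for four rows at a time; after the pairwise adds
+ // each lane of vsum holds the running sum of one row.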
+ for (int n = 0; n < cnt_n; n++) {
+ const float* din_ptr0 = src + stride;
+ const float* din_ptr1 = din_ptr0 + width_in;
+ const float* din_ptr2 = din_ptr1 + width_in;
+ const float* din_ptr3 = din_ptr2 + width_in;
+ float32x4_t vsum = vdupq_n_f32(0.f);
+ int tmp = rem_w;
+ for (int w = 0; w < cnt_w; w++) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
+ float32x4_t vb0 = vld1q_f32(din_ptr1);
+ float32x4_t vb1 = vld1q_f32(din_ptr1 + 4);
+ float32x4_t vc0 = vld1q_f32(din_ptr2);
+ float32x4_t vc1 = vld1q_f32(din_ptr2 + 4);
+ float32x4_t vs0 = vaddq_f32(va0, va1);
+ float32x4_t vd0 = vld1q_f32(din_ptr3);
+ float32x4_t vs1 = vaddq_f32(vb0, vb1);
+ float32x4_t vd1 = vld1q_f32(din_ptr3 + 4);
+ float32x4_t vs2 = vaddq_f32(vc0, vc1);
+ din_ptr0 += 8;
+ float32x4_t vs3 = vaddq_f32(vd0, vd1);
+ din_ptr1 += 8;
+ float32x4_t vs00 = vpaddq_f32(vs0, vs1);
+ din_ptr2 += 8;
+ float32x4_t vs01 = vpaddq_f32(vs2, vs3);
+ din_ptr3 += 8;
+ float32x4_t vs = vpaddq_f32(vs00, vs01);
+ vsum = vaddq_f32(vs, vsum);
+ }
+ if (tmp > 3) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ float32x4_t vb0 = vld1q_f32(din_ptr1);
+ float32x4_t vc0 = vld1q_f32(din_ptr2);
+ float32x4_t vd0 = vld1q_f32(din_ptr3);
+ din_ptr0 += 4;
+ din_ptr1 += 4;
+ float32x4_t vs00 = vpaddq_f32(va0, vb0);
+ float32x4_t vs01 = vpaddq_f32(vc0, vd0);
+ din_ptr2 += 4;
+ din_ptr3 += 4;
+ float32x4_t vs = vpaddq_f32(vs00, vs01);
+ vsum = vaddq_f32(vs, vsum);
+ tmp -= 4;
+ }
+ for (int w = 0; w < tmp; w++) {
+ vsum[0] += *din_ptr0++;
+ vsum[1] += *din_ptr1++;
+ vsum[2] += *din_ptr2++;
+ vsum[3] += *din_ptr3++;
+ }
+ stride += stride_n;
+ vst1q_f32(dst, vsum);
+ dst += 4;
+ }
+ if (rem_n > 1) {
+ const float* din_ptr0 = src + stride;
+ const float* din_ptr1 = din_ptr0 + width_in;
+ float32x4_t vsum = vdupq_n_f32(0.f);
+ for (int w = 0; w < cnt_w; w++) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ din_ptr0 += 4;
+ float32x4_t vb0 = vld1q_f32(din_ptr1);
+ din_ptr1 += 4;
+ float32x4_t va1 = vld1q_f32(din_ptr0);
+ float32x4_t vb1 = vld1q_f32(din_ptr1);
+ float32x4_t vs0 = vpaddq_f32(va0, vb0);
+ din_ptr0 += 4;
+ float32x4_t vs1 = vpaddq_f32(va1, vb1);
+ din_ptr1 += 4;
+ float32x4_t vs00 = vpaddq_f32(vs0, vs1);
+ vsum = vaddq_f32(vs00, vsum);
+ }
+ int tmp = rem_w;
+ if (tmp > 3) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ float32x4_t vb0 = vld1q_f32(din_ptr1);
+ din_ptr0 += 4;
+ din_ptr1 += 4;
+ float32x4_t vs00 = vpaddq_f32(va0, vb0);
+ tmp -= 4;
+ vsum[0] += vs00[0];
+ vsum[2] += vs00[1];
+ vsum[1] += vs00[2];
+ vsum[3] += vs00[3];
+ }
+ vsum[0] += vsum[2];
+ vsum[1] += vsum[3];
+ for (int w = 0; w < tmp; w++) {
+ vsum[0] += *din_ptr0++;
+ vsum[1] += *din_ptr1++;
+ }
+ stride += width_in;
+ *dst++ = vsum[0];
+ stride += width_in;
+ *dst++ = vsum[1];
+ rem_n -= 2;
+ }
+ for (int n = 0; n < rem_n; n++) {
+ const float* din_ptr0 = src + stride;
+ float32x4_t vsum = vdupq_n_f32(0.f);
+ for (int w = 0; w < cnt_w; w++) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ float32x4_t va1 = vld1q_f32(din_ptr0 + 4);
+ float32x4_t vs0 = vaddq_f32(va0, va1);
+ din_ptr0 += 8;
+ vsum = vaddq_f32(vs0, vsum);
+ }
+ if (rem_w > 3) {
+ float32x4_t va0 = vld1q_f32(din_ptr0);
+ din_ptr0 += 4;
+ vsum = vaddq_f32(vsum, va0);
+ rem_w -= 4;
+ }
+ vsum[1] += vsum[2];
+ for (int w = 0; w < rem_w; w++) {
+ vsum[0] += *din_ptr0++;
+ }
+ vsum[1] += vsum[3];
+ vsum[0] += vsum[1];
+ *dst++ = vsum[0];
+ }
+}
+
+template <>
+void reduce_sum_all(const float* src, float* dst, int all_size) {
+ int cnt_n = all_size >> 4;
+ int rem_n = all_size & 15;
+ int cnt_rem = rem_n >> 2;
+ int rem_rem = rem_n & 3;
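+ // sum sixteen floats per iteration, then a 4-wide block, then scalars;
+ // the four lanes are folded into dst[0] at the end.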
+ float32x4_t vsum = vdupq_n_f32(0.f);
+ for (int n = 0; n < cnt_n; n++) {
+ float32x4_t va0 = vld1q_f32(src);
+ float32x4_t va1 = vld1q_f32(src + 4);
+ float32x4_t va2 = vld1q_f32(src + 8);
+ float32x4_t va3 = vld1q_f32(src + 12);
+ src += 16;
+ float32x4_t vs0 = vaddq_f32(va0, va1);
+ float32x4_t vs1 = vaddq_f32(va2, va3);
+ float32x4_t vs = vpaddq_f32(vs0, vs1);
+ vsum = vaddq_f32(vsum, vs);
+ }
+ for (int n = 0; n < cnt_rem; n++) {
+ float32x4_t va0 = vld1q_f32(src);
+ src += 4;
+ vsum = vaddq_f32(vsum, va0);
+ }
+ vsum[1] += vsum[2];
+ for (int n = 0; n < rem_rem; n++) {
+ vsum[0] += *src++;
+ }
+ vsum[1] += vsum[3];
+ vsum[0] += vsum[1];
+ dst[0] = vsum[0];
+}
+
+template <>
+void reduce_sum_nc(const float* src,
+ float* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in) {
+ // reduce nc.
+ int num = num_in * channel_in;
+ int size = height_in * width_in;
+ reduce_sum_n(src, dst, num, size, 1, 1);
+}
+
+template <>
+void reduce_sum_ch(const float* src,
+ float* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in) {
+ int ch_size = channel_in * height_in;
+ int chw_size = ch_size * width_in;
+ for (int n = 0; n < num_in; n++) {
+ reduce_sum_n(src, dst, ch_size, 1, 1, width_in);
+ src += chw_size;
+ dst += width_in;
+ }
+}
+
+template <>
+void reduce_sum_hw(const float* src,
+ float* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in) {
+ int hw_size = height_in * width_in;
+ int nc_size = num_in * channel_in;
+ reduce_sum_w(src, dst, nc_size, 1, 1, hw_size);
+}
+
+} // namespace math
+} // namespace arm
+} // namespace lite
+} // namespace paddle
diff --git a/lite/backends/arm/math/reduce_sum.h b/lite/backends/arm/math/reduce_sum.h
new file mode 100644
index 0000000000000000000000000000000000000000..74e0b6dc75d17ca5a79c4b46c8535c7f30ec1c08
--- /dev/null
+++ b/lite/backends/arm/math/reduce_sum.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+template <typename T>
+void reduce_sum_n(const T* src,
+ T* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in);
+
+template <typename T>
+void reduce_sum_c(const T* src,
+ T* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in);
+
+template <typename T>
+void reduce_sum_h(const T* src,
+ T* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in);
+
+template <typename T>
+void reduce_sum_w(const T* src,
+ T* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in);
+
+template <typename T>
+void reduce_sum_nc(const T* src,
+ T* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in);
+
+template <typename T>
+void reduce_sum_ch(const T* src,
+ T* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in);
+
+template <typename T>
+void reduce_sum_hw(const T* src,
+ T* dst,
+ int num_in,
+ int channel_in,
+ int height_in,
+ int width_in);
+
+template <typename T>
+void reduce_sum_all(const T* src, T* dst, int all_size);
+
+} // namespace math
+} // namespace arm
+} // namespace lite
+} // namespace paddle
diff --git a/lite/backends/arm/math/scatter.cc b/lite/backends/arm/math/scatter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c9250a9bfa3fcfbdac2a8942aeff3bd28b4bc381
--- /dev/null
+++ b/lite/backends/arm/math/scatter.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "lite/backends/arm/math/scatter.h"
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+template <>
+void scatter(const int64_t* indexs,
+ const float* src,
+ float* dst,
+ int index_size,
+ int num,
+ int size,
+ bool overwrite) {
+ for (int i = 0; i < num; i++) {
+ const float* din = src + indexs[i] * size;
+ memcpy(dst, din, sizeof(float) * size);
+ dst += size;
+ }
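+ // rows indexed beyond `num` either overwrite the row at indexs[i] in dst or
+ // are accumulated into it element-wise, depending on `overwrite`.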
+ if (overwrite) {
+ for (int i = num; i < index_size; i++) {
+ const float* din = src + indexs[i] * size;
+ float* dout = dst + indexs[i] * size;
+ memcpy(dout, din, sizeof(float) * size);
+ }
+ } else {
+ int cnt = size >> 3;
+ int rem = size & 7;
+ for (int i = num; i < index_size; i++) {
+ const float* din = src + indexs[i] * size;
+ float* dout = dst + indexs[i] * size;
+ for (int j = 0; j < cnt; j++) {
+ float32x4_t va0 = vld1q_f32(din);
+ float32x4_t vb0 = vld1q_f32(dout);
+ float32x4_t va1 = vld1q_f32(din + 4);
+ float32x4_t vb1 = vld1q_f32(dout + 4);
+ vb0 = vaddq_f32(va0, vb0);
+ vb1 = vaddq_f32(va1, vb1);
+ din += 8;
+ vst1q_f32(dout, vb0);
+ vst1q_f32(dout + 4, vb1);
+ dout += 8;
+ }
+ for (int j = 0; j < rem; j++) {
+ dout[0] += *din++;
+ dout++;
+ }
+ }
+ }
+}
+
+} // namespace math
+} // namespace arm
+} // namespace lite
+} // namespace paddle
diff --git a/mobile/src/fpga/KD/dl_engine.hpp b/lite/backends/arm/math/scatter.h
similarity index 61%
rename from mobile/src/fpga/KD/dl_engine.hpp
rename to lite/backends/arm/math/scatter.h
index 861d7231dc745c90b415eba5757bdc6957290273..3d145367189eb61e7fdfbd5b20a55f5397ae702b 100644
--- a/mobile/src/fpga/KD/dl_engine.hpp
+++ b/lite/backends/arm/math/scatter.h
@@ -13,21 +13,22 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
-
-#include
-
-namespace paddle_mobile {
-namespace zynqmp {
-
-class DLEngine {
- public:
- static DLEngine& get_instance() {
- static DLEngine s_instance;
- return s_instance;
- }
-
- private:
- DLEngine();
-};
-} // namespace zynqmp
-} // namespace paddle_mobile
+#include <stdint.h>
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+template <typename T>
+void scatter(const int64_t* indexs,
+ const T* updates,
+ T* dst,
+ int index_size,
+ int num,
+ int size,
+ bool overwrite);
+} // namespace math
+} // namespace arm
+} // namespace lite
+} // namespace paddle
diff --git a/lite/backends/bm/target_wrapper.cc b/lite/backends/bm/target_wrapper.cc
index 6dab2a574d9c270573c00688768ad45a767abeae..83aa4dc8c1a6462bfd38a1c59f438e4836a3da00 100644
--- a/lite/backends/bm/target_wrapper.cc
+++ b/lite/backends/bm/target_wrapper.cc
@@ -23,7 +23,7 @@ int TargetWrapperBM::device_id_ = 0;
std::map<int, void*> TargetWrapperBM::bm_hds_;
size_t TargetWrapperBM::num_devices() {
- int count = 0;
+ int count = 1;
bm_status_t ret = bm_dev_getcount(&count);
CHECK_EQ(ret, BM_SUCCESS) << "Failed with error code: "
<< static_cast<int>(ret);
diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
index 6fbdc21f934f21dd26c3eb66885f7087e3d340c0..7d86730b93e9e71c32d9f25c2ab0406715f6cdec 100755
--- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
@@ -48,7 +48,7 @@ __kernel void depth_conv2d_3x3(
int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
int2 in_pos_in_one_block =
- ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+ ouput_pos_in_one_block * stride_xy + (int2)(offset + dilation - 1, offset + dilation - 1);
#ifdef BIASE_CH
CL_DTYPE4 output =
@@ -77,13 +77,13 @@ __kernel void depth_conv2d_3x3(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
- (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
- pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+ (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
+ pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
(CL_DTYPE4)(0.0f),
- (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
- in_pos_in_one_block.y - 1 < 0 ||
- in_pos_in_one_block.x - 1 >= input_width ||
- in_pos_in_one_block.y - 1 >= input_height)
+ (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+ in_pos_in_one_block.y - dilation < 0 ||
+ in_pos_in_one_block.x - dilation >= input_width ||
+ in_pos_in_one_block.y - dilation >= input_height)
<< 15));
inputs[1] = select(
@@ -91,45 +91,37 @@ __kernel void depth_conv2d_3x3(
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
- pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+ pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
(CL_DTYPE4)(0.0f),
- (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 ||
+ (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
- in_pos_in_one_block.y - 1 >= input_height)
+ in_pos_in_one_block.y - dilation >= input_height)
<< 15));
inputs[2] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
- (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
- pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+ (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
+ pos_in_input_block.y + in_pos_in_one_block.y - dilation)),
(CL_DTYPE4)(0.0f),
- (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
- in_pos_in_one_block.y - 1 < 0 ||
- in_pos_in_one_block.x + 1 >= input_width ||
- in_pos_in_one_block.y - 1 >= input_height)
+ (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+ in_pos_in_one_block.y - dilation < 0 ||
+ in_pos_in_one_block.x + dilation >= input_width ||
+ in_pos_in_one_block.y - dilation >= input_height)
<< 15));
inputs[3] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
- (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
+ (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f),
- (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 ||
- in_pos_in_one_block.x - 1 >= input_width ||
+ (ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 ||
+ in_pos_in_one_block.x - dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
- /*
- if (output_pos.x == 112 && output_pos.y == 0) {
- CL_DTYPE4 input1 = inputs[3];
- float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
- printf(" input4 3 - %v4hlf \n", in);
- printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
- }
- */
inputs[4] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
@@ -147,11 +139,11 @@ __kernel void depth_conv2d_3x3(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
- (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
+ (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
pos_in_input_block.y + in_pos_in_one_block.y)),
(CL_DTYPE4)(0.0f),
- (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 ||
- in_pos_in_one_block.x + 1 >= input_width ||
+ (ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 ||
+ in_pos_in_one_block.x + dilation >= input_width ||
in_pos_in_one_block.y >= input_height)
<< 15));
@@ -159,13 +151,13 @@ __kernel void depth_conv2d_3x3(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
- (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1,
- pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+ (int2)(pos_in_input_block.x + in_pos_in_one_block.x - dilation,
+ pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
(CL_DTYPE4)(0.0f),
- (ushort4)((in_pos_in_one_block.x - 1 < 0 ||
- in_pos_in_one_block.y + 1 < 0 ||
- in_pos_in_one_block.x - 1 >= input_width ||
- in_pos_in_one_block.y + 1 >= input_height)
+ (ushort4)((in_pos_in_one_block.x - dilation < 0 ||
+ in_pos_in_one_block.y + dilation < 0 ||
+ in_pos_in_one_block.x - dilation >= input_width ||
+ in_pos_in_one_block.y + dilation >= input_height)
<< 15));
inputs[7] = select(
@@ -173,24 +165,24 @@ __kernel void depth_conv2d_3x3(
input,
sampler,
(int2)(pos_in_input_block.x + in_pos_in_one_block.x,
- pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+ pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
(CL_DTYPE4)(0.0f),
- (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 ||
+ (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 ||
in_pos_in_one_block.x >= input_width ||
- in_pos_in_one_block.y + 1 >= input_height)
+ in_pos_in_one_block.y + dilation >= input_height)
<< 15));
inputs[8] = select(
READ_IMG_TYPE(CL_DTYPE_CHAR,
input,
sampler,
- (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1,
- pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+ (int2)(pos_in_input_block.x + in_pos_in_one_block.x + dilation,
+ pos_in_input_block.y + in_pos_in_one_block.y + dilation)),
(CL_DTYPE4)(0.0f),
- (ushort4)((in_pos_in_one_block.x + 1 < 0 ||
- in_pos_in_one_block.y + 1 < 0 ||
- in_pos_in_one_block.x + 1 >= input_width ||
- in_pos_in_one_block.y + 1 >= input_height)
+ (ushort4)((in_pos_in_one_block.x + dilation < 0 ||
+ in_pos_in_one_block.y + dilation < 0 ||
+ in_pos_in_one_block.x + dilation >= input_width ||
+ in_pos_in_one_block.y + dilation >= input_height)
<< 15));
CL_DTYPE4 filters[9];
@@ -221,14 +213,18 @@ __kernel void depth_conv2d_3x3(
/*
- if (output_pos.x == 112 && output_pos.y == 0) {
+ if (output_pos.x == 0 && output_pos.y == 0) {
for (int i = 0; i < 9; ++i) {
CL_DTYPE4 input1 = inputs[i];
float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
- printf(" input4 %d - %v4hlf \n", i, in);
+ printf(" input4[%d]: %v4hlf \n", i, in);
+ }
+ for (int i = 0; i < 9; ++i) {
+ CL_DTYPE4 filters1 = filters[i];
+ float4 f = (float4)(filters1.x, filters1.y, filters1.z, filters1.w);
+ printf(" weights4[%d]: %v4hlf \n", i, f);
}
-
float4 out = (float4)(output.x, output.y, output.z, output.w);
printf(" depth wise output output4 = %v4hlf \n", out);
printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
diff --git a/lite/backends/x86/cpu_info.cc b/lite/backends/x86/cpu_info.cc
index 276b62654f3c8b25d23e629c706e4877dabc3889..3ba8dc50783b2118564fc24f802053e4d414aace 100644
--- a/lite/backends/x86/cpu_info.cc
+++ b/lite/backends/x86/cpu_info.cc
@@ -24,6 +24,7 @@
#include
#elif defined(_WIN32)
#define NOMINMAX // msvc max/min macro conflict with std::min/max
+#define GLOG_NO_ABBREVIATED_SEVERITIES  // glog's abbreviated severities (e.g. ERROR) clash with windows.h macros
#include <windows.h>
#else
#include
diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt
index a89107632341cf063ac3166aa9890ff383e3383f..b5262efa4e8ca3fbfa3076fb9a5eb6fe1993ccb2 100644
--- a/lite/backends/x86/math/CMakeLists.txt
+++ b/lite/backends/x86/math/CMakeLists.txt
@@ -61,3 +61,5 @@ math_library(search_fc DEPS blas dynload_mklml)
# cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search)
# cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
# cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
+math_library(box_coder DEPS math_function)
+math_library(prior_box DEPS math_function)
diff --git a/lite/backends/x86/math/box_coder.cc b/lite/backends/x86/math/box_coder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..efe3c14fdad1ab529262731316c048e4238cd223
--- /dev/null
+++ b/lite/backends/x86/math/box_coder.cc
@@ -0,0 +1,166 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "lite/backends/x86/math/box_coder.h"
+#include <cmath>
+#include <cstring>
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+
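+// Encode each target box against every prior box into (dx, dy, dw, dh):
+//   dx = (target_cx - prior_cx) / prior_w,  dy = (target_cy - prior_cy) / prior_h
+//   dw = log(|target_w / prior_w|),         dh = log(|target_h / prior_h|)
+// and then divide by the per-prior variances (or the global `variance` vector).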
+void encode_center_size(const int64_t row, // N
+ const int64_t col, // M
+ const int64_t len, // 4
+ const float* target_box_data,
+ const float* prior_box_data,
+ const float* prior_box_var_data,
+ const bool normalized,
+                        const std::vector<float> variance,
+ float* output) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(2)
+#endif
+ for (int64_t i = 0; i < row; ++i) {
+ for (int64_t j = 0; j < col; ++j) {
+ size_t offset = i * col * len + j * len;
+ float prior_box_width = prior_box_data[j * len + 2] -
+ prior_box_data[j * len] + (normalized == false);
+ float prior_box_height = prior_box_data[j * len + 3] -
+ prior_box_data[j * len + 1] +
+ (normalized == false);
+ float prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2;
+ float prior_box_center_y =
+ prior_box_data[j * len + 1] + prior_box_height / 2;
+
+ float target_box_center_x =
+ (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
+ float target_box_center_y =
+ (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
+ float target_box_width = target_box_data[i * len + 2] -
+ target_box_data[i * len] + (normalized == false);
+ float target_box_height = target_box_data[i * len + 3] -
+ target_box_data[i * len + 1] +
+ (normalized == false);
+
+ output[offset] =
+ (target_box_center_x - prior_box_center_x) / prior_box_width;
+ output[offset + 1] =
+ (target_box_center_y - prior_box_center_y) / prior_box_height;
+ output[offset + 2] =
+ std::log(std::fabs(target_box_width / prior_box_width));
+ output[offset + 3] =
+ std::log(std::fabs(target_box_height / prior_box_height));
+ }
+ }
+
+ if (prior_box_var_data) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(3)
+#endif
+ for (int64_t i = 0; i < row; ++i) {
+ for (int64_t j = 0; j < col; ++j) {
+ for (int64_t k = 0; k < len; ++k) {
+ size_t offset = i * col * len + j * len;
+ int prior_var_offset = j * len;
+ output[offset + k] /= prior_box_var_data[prior_var_offset + k];
+ }
+ }
+ }
+ } else if (!(variance.empty())) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(3)
+#endif
+ for (int64_t i = 0; i < row; ++i) {
+ for (int64_t j = 0; j < col; ++j) {
+ for (int64_t k = 0; k < len; ++k) {
+ size_t offset = i * col * len + j * len;
+ output[offset + k] /= variance[k];
+ }
+ }
+ }
+ }
+}
+
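+// Decode (dx, dy, dw, dh) deltas back into corner-form [xmin, ymin, xmax, ymax]
+// boxes. `axis` chooses whether priors are indexed by column (0) or row (1);
+// `var_size` selects per-prior variances (2), the global `variance` vector (1),
+// or no variance scaling otherwise.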
+void decode_center_size(const int axis,
+ const int var_size,
+ const int64_t row,
+ const int64_t col,
+ const int64_t len,
+ const float* target_box_data,
+ const float* prior_box_data,
+ const float* prior_box_var_data,
+ const bool normalized,
+                        const std::vector<float> variance,
+ float* output) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(2)
+#endif
+ for (int64_t i = 0; i < row; ++i) {
+ for (int64_t j = 0; j < col; ++j) {
+ float var_data[4] = {1., 1., 1., 1.};
+ float* var_ptr = var_data;
+ size_t offset = i * col * len + j * len;
+ int prior_box_offset = axis == 0 ? j * len : i * len;
+
+ float prior_box_width = prior_box_data[prior_box_offset + 2] -
+ prior_box_data[prior_box_offset] +
+ (normalized == false);
+ float prior_box_height = prior_box_data[prior_box_offset + 3] -
+ prior_box_data[prior_box_offset + 1] +
+ (normalized == false);
+ float prior_box_center_x =
+ prior_box_data[prior_box_offset] + prior_box_width / 2;
+ float prior_box_center_y =
+ prior_box_data[prior_box_offset + 1] + prior_box_height / 2;
+
+ float target_box_center_x = 0, target_box_center_y = 0;
+ float target_box_width = 0, target_box_height = 0;
+ int prior_var_offset = axis == 0 ? j * len : i * len;
+ if (var_size == 2) {
+ std::memcpy(
+ var_ptr, prior_box_var_data + prior_var_offset, 4 * sizeof(float));
+ } else if (var_size == 1) {
+        var_ptr = const_cast<float*>(variance.data());
+ }
+ float box_var_x = *var_ptr;
+ float box_var_y = *(var_ptr + 1);
+ float box_var_w = *(var_ptr + 2);
+ float box_var_h = *(var_ptr + 3);
+
+ target_box_center_x =
+ box_var_x * target_box_data[offset] * prior_box_width +
+ prior_box_center_x;
+ target_box_center_y =
+ box_var_y * target_box_data[offset + 1] * prior_box_height +
+ prior_box_center_y;
+ target_box_width =
+ std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width;
+ target_box_height =
+ std::exp(box_var_h * target_box_data[offset + 3]) * prior_box_height;
+
+ output[offset] = target_box_center_x - target_box_width / 2;
+ output[offset + 1] = target_box_center_y - target_box_height / 2;
+ output[offset + 2] =
+ target_box_center_x + target_box_width / 2 - (normalized == false);
+ output[offset + 3] =
+ target_box_center_y + target_box_height / 2 - (normalized == false);
+ }
+ }
+}
+
+} // namespace math
+} // namespace x86
+} // namespace lite
+} // namespace paddle
diff --git a/lite/backends/x86/math/box_coder.h b/lite/backends/x86/math/box_coder.h
new file mode 100644
index 0000000000000000000000000000000000000000..fc31f888ab7ed281533e187ca8b51344f150662a
--- /dev/null
+++ b/lite/backends/x86/math/box_coder.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "lite/backends/x86/math/math_function.h"
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+
+void encode_center_size(const int64_t row,
+ const int64_t col,
+ const int64_t len,
+ const float* target_box_data,
+ const float* prior_box_data,
+ const float* prior_box_var_data,
+ const bool normalized,
+                        const std::vector<float> variance,
+ float* output);
+
+void decode_center_size(const int axis,
+ const int var_size,
+ const int64_t row,
+ const int64_t col,
+ const int64_t len,
+ const float* target_box_data,
+ const float* prior_box_data,
+ const float* prior_box_var_data,
+ const bool normalized,
+                        const std::vector<float> variance,
+ float* output);
+
+} // namespace math
+} // namespace x86
+} // namespace lite
+} // namespace paddle
diff --git a/lite/backends/x86/math/context_project.h b/lite/backends/x86/math/context_project.h
index 72a2f4ce12cbd72b26cd87e97d0178275a4b4abd..6363488c4ccbe0a22245e96d62feab53f6a55185 100644
--- a/lite/backends/x86/math/context_project.h
+++ b/lite/backends/x86/math/context_project.h
@@ -161,7 +161,7 @@ class ContextProjectFunctor {
sequence_width});
if (up_pad > 0) { // add up pad
- int padding_rows = std::min(
+ int padding_rows = (std::min)(
          up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
for (int k = 0; k < padding_rows; ++k) {
@@ -180,10 +180,10 @@ class ContextProjectFunctor {
}
if (down_pad > 0) { // add down pad
int down_pad_begin_row =
- std::max(0,
- (sequence_height - context_start - context_length) + 1) +
+ (std::max)(
+ 0, (sequence_height - context_start - context_length) + 1) +
1;
- int padding_begin = std::max(0, context_start - sequence_height);
+ int padding_begin = (std::max)(0, context_start - sequence_height);
int padding_size =
sequence_height - context_start >= context_length
? 1
diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc
index 4393c42157bb7667ec2218e8b76f05a2c60bcc86..ae2a0cd3319dad56589b631b961f0e3a1098a45f 100644
--- a/lite/backends/x86/math/pooling.cc
+++ b/lite/backends/x86/math/pooling.cc
@@ -67,8 +67,8 @@ class Pool2dFunctor {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
- hend = std::min(hstart + ksize_height, input_height);
- hstart = std::max(hstart, 0);
+ hend = (std::min)(hstart + ksize_height, input_height);
+ hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
@@ -76,8 +76,8 @@ class Pool2dFunctor {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
- wend = std::min(wstart + ksize_width, input_width);
- wstart = std::max(wstart, 0);
+ wend = (std::min)(wstart + ksize_width, input_width);
+ wstart = (std::max)(wstart, 0);
}
T ele = pool_process.initial();
@@ -150,8 +150,8 @@ class Pool2dGradFunctor {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
- hend = std::min(hstart + ksize_height, input_height);
- hstart = std::max(hstart, 0);
+ hend = (std::min)(hstart + ksize_height, input_height);
+ hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
@@ -159,8 +159,8 @@ class Pool2dGradFunctor {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
- wend = std::min(wstart + ksize_width, input_width);
- wstart = std::max(wstart, 0);
+ wend = (std::min)(wstart + ksize_width, input_width);
+ wstart = (std::max)(wstart, 0);
}
int pool_size = (exclusive || adaptive)
? (hend - hstart) * (wend - wstart)
@@ -228,12 +228,12 @@ class MaxPool2dGradFunctor {
for (int c = 0; c < output_channels; ++c) {
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
- int hend = std::min(hstart + ksize_height, input_height);
- hstart = std::max(hstart, 0);
+ int hend = (std::min)(hstart + ksize_height, input_height);
+ hstart = (std::max)(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
- int wend = std::min(wstart + ksize_width, input_width);
- wstart = std::max(wstart, 0);
+ int wend = (std::min)(wstart + ksize_width, input_width);
+ wstart = (std::max)(wstart, 0);
bool stop = false;
for (int h = hstart; h < hend && !stop; ++h) {
@@ -337,8 +337,8 @@ class Pool3dFunctor {
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
- dend = std::min(dstart + ksize_depth, input_depth);
- dstart = std::max(dstart, 0);
+ dend = (std::min)(dstart + ksize_depth, input_depth);
+ dstart = (std::max)(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
@@ -346,8 +346,8 @@ class Pool3dFunctor {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
- hend = std::min(hstart + ksize_height, input_height);
- hstart = std::max(hstart, 0);
+ hend = (std::min)(hstart + ksize_height, input_height);
+ hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
@@ -355,8 +355,8 @@ class Pool3dFunctor {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
- wend = std::min(wstart + ksize_width, input_width);
- wstart = std::max(wstart, 0);
+ wend = (std::min)(wstart + ksize_width, input_width);
+ wstart = (std::max)(wstart, 0);
}
int output_idx = (pd * output_height + ph) * output_width + pw;
T ele = pool_process.initial();
@@ -441,8 +441,8 @@ class Pool3dGradFunctor {
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
- dend = std::min(dstart + ksize_depth, input_depth);
- dstart = std::max(dstart, 0);
+ dend = (std::min)(dstart + ksize_depth, input_depth);
+ dstart = (std::max)(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
@@ -450,8 +450,8 @@ class Pool3dGradFunctor {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
- hend = std::min(hstart + ksize_height, input_height);
- hstart = std::max(hstart, 0);
+ hend = (std::min)(hstart + ksize_height, input_height);
+ hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
@@ -459,8 +459,8 @@ class Pool3dGradFunctor {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
- wend = std::min(wstart + ksize_width, input_width);
- wstart = std::max(wstart, 0);
+ wend = (std::min)(wstart + ksize_width, input_width);
+ wstart = (std::max)(wstart, 0);
}
int pool_size =
@@ -540,16 +540,16 @@ class MaxPool3dGradFunctor {
for (int c = 0; c < output_channels; ++c) {
for (int pd = 0; pd < output_depth; ++pd) {
int dstart = pd * stride_depth - padding_depth;
- int dend = std::min(dstart + ksize_depth, input_depth);
- dstart = std::max(dstart, 0);
+ int dend = (std::min)(dstart + ksize_depth, input_depth);
+ dstart = (std::max)(dstart, 0);
for (int ph = 0; ph < output_height; ++ph) {
int hstart = ph * stride_height - padding_height;
- int hend = std::min(hstart + ksize_height, input_height);
- hstart = std::max(hstart, 0);
+ int hend = (std::min)(hstart + ksize_height, input_height);
+ hstart = (std::max)(hstart, 0);
for (int pw = 0; pw < output_width; ++pw) {
int wstart = pw * stride_width - padding_width;
- int wend = std::min(wstart + ksize_width, input_width);
- wstart = std::max(wstart, 0);
+ int wend = (std::min)(wstart + ksize_width, input_width);
+ wstart = (std::max)(wstart, 0);
bool stop = false;
for (int d = dstart; d < dend && !stop; ++d) {
for (int h = hstart; h < hend && !stop; ++h) {
@@ -651,8 +651,8 @@ class MaxPool2dWithIndexFunctor {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
- hend = std::min(hstart + ksize_height, input_height);
- hstart = std::max(hstart, 0);
+ hend = (std::min)(hstart + ksize_height, input_height);
+ hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
@@ -660,8 +660,8 @@ class MaxPool2dWithIndexFunctor {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
- wend = std::min(wstart + ksize_width, input_width);
- wstart = std::max(wstart, 0);
+ wend = (std::min)(wstart + ksize_width, input_width);
+ wstart = (std::max)(wstart, 0);
}
        T1 ele = static_cast<T1>(-FLT_MAX);
@@ -794,8 +794,8 @@ class MaxPool3dWithIndexFunctor {
dend = AdaptEndIndex(pd, input_depth, output_depth);
} else {
dstart = pd * stride_depth - padding_depth;
- dend = std::min(dstart + ksize_depth, input_depth);
- dstart = std::max(dstart, 0);
+ dend = (std::min)(dstart + ksize_depth, input_depth);
+ dstart = (std::max)(dstart, 0);
}
for (int ph = 0; ph < output_height; ++ph) {
if (adaptive) {
@@ -803,8 +803,8 @@ class MaxPool3dWithIndexFunctor {
hend = AdaptEndIndex(ph, input_height, output_height);
} else {
hstart = ph * stride_height - padding_height;
- hend = std::min(hstart + ksize_height, input_height);
- hstart = std::max(hstart, 0);
+ hend = (std::min)(hstart + ksize_height, input_height);
+ hstart = (std::max)(hstart, 0);
}
for (int pw = 0; pw < output_width; ++pw) {
if (adaptive) {
@@ -812,8 +812,8 @@ class MaxPool3dWithIndexFunctor {
wend = AdaptEndIndex(pw, input_width, output_width);
} else {
wstart = pw * stride_width - padding_width;
- wend = std::min(wstart + ksize_width, input_width);
- wstart = std::max(wstart, 0);
+ wend = (std::min)(wstart + ksize_width, input_width);
+ wstart = (std::max)(wstart, 0);
}
int output_idx = (pd * output_height + ph) * output_width + pw;
diff --git a/lite/backends/x86/math/prior_box.cc b/lite/backends/x86/math/prior_box.cc
new file mode 100644
index 0000000000000000000000000000000000000000..159838895ad8145e4db81f5f3701ec8ddb2611a4
--- /dev/null
+++ b/lite/backends/x86/math/prior_box.cc
@@ -0,0 +1,118 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "lite/backends/x86/math/prior_box.h"
+#include <algorithm>
+#include <cmath>
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+
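+// Generate density prior boxes for one feature map: every feature-map cell
+// emits density x density shifted boxes for each (fixed_size, fixed_ratio)
+// pair, stored as normalized [xmin, ymin, xmax, ymax] in `boxes_data`;
+// `vars_data` is filled with the four `variances` for every prior.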
+void density_prior_box(const int64_t img_width,
+ const int64_t img_height,
+ const int64_t feature_width,
+ const int64_t feature_height,
+ const float* input_data,
+ const float* image_data,
+ const bool clip,
+                       const std::vector<float> variances,
+                       const std::vector<float> fixed_sizes,
+                       const std::vector<float> fixed_ratios,
+                       const std::vector<int> densities,
+ const float step_width,
+ const float step_height,
+ const float offset,
+ const int num_priors,
+ float* boxes_data,
+ float* vars_data) {
+  int step_average = static_cast<int>((step_width + step_height) * 0.5);
+
+  // Pre-size the vector so the (optionally parallel) loop can assign by index;
+  // push_back is not safe under "#pragma omp parallel for".
+  std::vector<float> sqrt_fixed_ratios(fixed_ratios.size());
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+  for (int64_t i = 0; i < static_cast<int64_t>(fixed_ratios.size()); i++) {
+    sqrt_fixed_ratios[i] = sqrt(fixed_ratios[i]);
+  }
+
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(2)
+#endif
+ for (int64_t h = 0; h < feature_height; ++h) {
+ for (int64_t w = 0; w < feature_width; ++w) {
+ float center_x = (w + offset) * step_width;
+ float center_y = (h + offset) * step_height;
+ int64_t offset = (h * feature_width + w) * num_priors * 4;
+ // Generate density prior boxes with fixed sizes.
+ for (size_t s = 0; s < fixed_sizes.size(); ++s) {
+ auto fixed_size = fixed_sizes[s];
+ int density = densities[s];
+ int shift = step_average / density;
+ // Generate density prior boxes with fixed ratios.
+ for (size_t r = 0; r < fixed_ratios.size(); ++r) {
+ float box_width_ratio = fixed_size * sqrt_fixed_ratios[r];
+ float box_height_ratio = fixed_size / sqrt_fixed_ratios[r];
+ float density_center_x = center_x - step_average / 2. + shift / 2.;
+ float density_center_y = center_y - step_average / 2. + shift / 2.;
+ for (int di = 0; di < density; ++di) {
+ for (int dj = 0; dj < density; ++dj) {
+ float center_x_temp = density_center_x + dj * shift;
+ float center_y_temp = density_center_y + di * shift;
+ boxes_data[offset++] = std::max(
+ (center_x_temp - box_width_ratio / 2.) / img_width, 0.);
+ boxes_data[offset++] = std::max(
+ (center_y_temp - box_height_ratio / 2.) / img_height, 0.);
+ boxes_data[offset++] = std::min(
+ (center_x_temp + box_width_ratio / 2.) / img_width, 1.);
+ boxes_data[offset++] = std::min(
+ (center_y_temp + box_height_ratio / 2.) / img_height, 1.);
+ }
+ }
+ }
+ }
+ }
+ }
+ //! clip the prior's coordinate such that it is within [0, 1]
+ if (clip) {
+ int channel_size = feature_height * feature_width * num_priors * 4;
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+ for (int d = 0; d < channel_size; ++d) {
+ boxes_data[d] = std::min(std::max(boxes_data[d], 0.f), 1.f);
+ }
+ }
+//! set the variance.
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for collapse(3)
+#endif
+ for (int h = 0; h < feature_height; ++h) {
+ for (int w = 0; w < feature_width; ++w) {
+ for (int i = 0; i < num_priors; ++i) {
+ int idx = ((h * feature_width + w) * num_priors + i) * 4;
+ vars_data[idx++] = variances[0];
+ vars_data[idx++] = variances[1];
+ vars_data[idx++] = variances[2];
+ vars_data[idx++] = variances[3];
+ }
+ }
+ }
+}
+
+} // namespace math
+} // namespace x86
+} // namespace lite
+} // namespace paddle
diff --git a/lite/backends/x86/math/prior_box.h b/lite/backends/x86/math/prior_box.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b090551a014a8019e38f5fdcede38b86bfab720
--- /dev/null
+++ b/lite/backends/x86/math/prior_box.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "lite/backends/x86/math/math_function.h"
+
+namespace paddle {
+namespace lite {
+namespace x86 {
+namespace math {
+
+void density_prior_box(const int64_t img_width,
+ const int64_t img_height,
+ const int64_t feature_width,
+ const int64_t feature_height,
+ const float* input_data,
+ const float* image_data,
+ const bool clip,
+                       const std::vector<float> variances,
+                       const std::vector<float> fixed_sizes,
+                       const std::vector<float> fixed_ratios,
+                       const std::vector<int> densities,
+ const float step_width,
+ const float step_height,
+ const float offset,
+ const int num_priors,
+ float* boxes_data,
+ float* vars_data);
+
+} // namespace math
+} // namespace x86
+} // namespace lite
+} // namespace paddle
diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h
index 5512c4aa11fb5dc05283d01b1d6d3da7fb83c064..f254242714d92852498b3cc72fed0a911510e829 100644
--- a/lite/backends/x86/math/sequence_padding.h
+++ b/lite/backends/x86/math/sequence_padding.h
@@ -35,7 +35,7 @@ inline static uint64_t MaximumSequenceLength(
uint64_t seq_num = seq_offset.size() - 1;
uint64_t max_seq_len = 0;
for (size_t i = 0; i < seq_num; ++i) {
- max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
+ max_seq_len = (std::max)(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
}
return max_seq_len;
}
diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h
index 49794b8e15a8f90a6512798baa842534df879f6b..33ba672778fa53f4af77c8cbb663b163c2b9c5a3 100644
--- a/lite/backends/x86/parallel.h
+++ b/lite/backends/x86/parallel.h
@@ -26,7 +26,7 @@ namespace x86 {
static void SetNumThreads(int num_threads) {
#ifdef PADDLE_WITH_MKLML
- int real_num_threads = std::max(num_threads, 1);
+ int real_num_threads = (std::max)(num_threads, 1);
x86::MKL_Set_Num_Threads(real_num_threads);
omp_set_num_threads(real_num_threads);
#endif
@@ -52,14 +52,14 @@ static inline void RunParallelFor(const int64_t begin,
}
#ifdef PADDLE_WITH_MKLML
- int64_t num_threads = std::min(GetMaxThreads(), end - begin);
+ int64_t num_threads = (std::min)(GetMaxThreads(), end - begin);
if (num_threads > 1) {
#pragma omp parallel num_threads(num_threads)
{
int64_t tid = omp_get_thread_num();
int64_t chunk_size = (end - begin + num_threads - 1) / num_threads;
int64_t begin_tid = begin + tid * chunk_size;
- f(begin_tid, std::min(end, chunk_size + begin_tid));
+ f(begin_tid, (std::min)(end, chunk_size + begin_tid));
}
return;
}
diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc
index a3d8729410299170964e3ce3b59feb4b970a121b..5f5eae4703a0a0c5db3f026dabaea76d3371b03a 100644
--- a/lite/backends/xpu/target_wrapper.cc
+++ b/lite/backends/xpu/target_wrapper.cc
@@ -18,6 +18,27 @@
namespace paddle {
namespace lite {
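+// Grow the scratch pad to at least `new_size` bytes. Only buffers allocated
+// from global XPU memory can be re-allocated; L3 scratch pads cannot grow.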
+void XPUScratchPad::Reserve(size_t new_size) {
+ if (new_size <= size_) {
+ return;
+ }
+
+ if (!is_l3_) {
+ TargetWrapperXPU::Free(addr_);
+ addr_ = TargetWrapperXPU::Malloc(new_size);
+ size_ = new_size;
+ } else {
+ CHECK(false) << "Not supported if is_l3_ == true";
+ }
+}
+
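+// Free global-memory scratch pads on destruction; L3 memory is managed by the
+// runtime and is not freed here.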
+void XPUScratchPadDeleter::operator()(XPUScratchPad* sp) const {
+ if (!sp->is_l3_) {
+ TargetWrapperXPU::Free(sp->addr_);
+ }
+ delete sp;
+}
+
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
XPU_CALL(xpu_malloc(&ptr, size));
@@ -51,7 +72,7 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size,
ptr = TargetWrapperXPU::Malloc(size);
}
CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3;
- return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3));
+ return XPUScratchPadGuard(new XPUScratchPad(ptr, size, use_l3));
}
std::string TargetWrapperXPU::multi_encoder_precision; // NOLINT
diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h
index 1a888b126a43783ddae5654de38f5b2e201eaa5e..8151d733ba4b506d3d24fd7e7c150c5f12f1e691 100644
--- a/lite/backends/xpu/target_wrapper.h
+++ b/lite/backends/xpu/target_wrapper.h
@@ -37,19 +37,19 @@ const int XPU_MAX_LOD_SEQ_LEN = 512;
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
struct XPUScratchPad {
- XPUScratchPad(void* addr, bool is_l3) : addr_(addr), is_l3_(is_l3) {}
+ XPUScratchPad(void* addr, size_t size, bool is_l3)
+ : addr_(addr), size_(size), is_l3_(is_l3) {}
+
+ // XXX(miaotianxiang): |size_| increases monotonically
+ void Reserve(size_t new_size);
void* addr_{nullptr};
+ size_t size_{0};
bool is_l3_{false};
};
struct XPUScratchPadDeleter {
- void operator()(XPUScratchPad* sp) const {
- if (!sp->is_l3_) {
- XPU_CALL(xpu_free(sp->addr_));
- }
- delete sp;
- }
+ void operator()(XPUScratchPad* sp) const;
};
using XPUScratchPadGuard = std::unique_ptr<XPUScratchPad, XPUScratchPadDeleter>;
diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt
index f6f8b231fe5448ca65f86e1234208c97d6860622..2a7751cd2a635ca83a602f7a53a1487e263b8c78 100644
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -2,7 +2,7 @@ if (WITH_TESTING)
lite_cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest gflags)
endif()
lite_cc_library(target_wrapper SRCS target_wrapper.cc
- DEPS target_wrapper_host place
+ DEPS target_wrapper_host place fbs_headers
X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda
XPU_DEPS target_wrapper_xpu
diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc
index cd135f85b3b55641ae1996b2d3b933e1da7870dc..0cf13ab6996df09f76d32e9482455a87d53a5e15 100644
--- a/lite/core/device_info.cc
+++ b/lite/core/device_info.cc
@@ -176,6 +176,9 @@ void get_cpu_arch(std::vector* archs, const int cpu_num) {
case 0xd0a:
arch_type = kA75;
break;
+ case 0xd0d:
+ arch_type = kA77;
+ break;
case 0xd40:
arch_type = kA76;
break;
@@ -637,6 +640,20 @@ void DeviceInfo::SetArchInfo(int argc, ...) {
bool DeviceInfo::SetCPUInfoByName() {
/* Snapdragon */
+ if (dev_name_.find("KONA") != std::string::npos) { // 865
+ core_num_ = 8;
+ core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+ big_core_ids_ = {4, 5, 6, 7};
+ little_core_ids_ = {0, 1, 2, 3};
+ cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+ SetArchInfo(2, kA77, kA55);
+ SetCacheInfo(0, 2, 192 * 1024, 256 * 1024);
+ SetCacheInfo(1, 2, 768 * 1024, 512 * 1024);
+ SetCacheInfo(2, 1, 4 * 1024 * 1024);
+ SetFP16Info(1, 1);
+ SetDotInfo(2, 1, 1);
+ return true;
+ }
if (dev_name_.find("SM8150") != std::string::npos) { // 855
core_num_ = 8;
core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
diff --git a/lite/core/device_info.h b/lite/core/device_info.h
index c95f285e1433e9ca55595d4a5f0cb814c488fe7b..bc82245c8d47379901f6454aecedea5842ce1973 100644
--- a/lite/core/device_info.h
+++ b/lite/core/device_info.h
@@ -17,6 +17,7 @@
#include
#include
#include
+#include "lite/api/paddle_api.h"
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
#ifdef LITE_WITH_MLU
@@ -27,6 +28,7 @@
namespace paddle {
namespace lite {
+using L3CacheSetMethod = lite_api::L3CacheSetMethod;
#if ((defined LITE_WITH_ARM) || (defined LITE_WITH_MLU))
typedef enum {
@@ -38,6 +40,8 @@ typedef enum {
kA73 = 73,
kA75 = 75,
kA76 = 76,
+ kA77 = 77,
+ kA78 = 78,
kARMArch_UNKOWN = -1
} ARMArch;
@@ -65,11 +69,41 @@ class DeviceInfo {
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
+ // Methods for allocating L3Cache on Arm platform
+ // Enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`
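+  // e.g. SetArmL3CacheSize(L3CacheSetMethod::kAbsolute, 1024 * 1024) pins the
+  // sgemm workspace to a caller-chosen 1 MB budget.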
+ void SetArmL3CacheSize(
+ L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache,
+ int absolute_val = -1) {
+ l3_cache_method_ = method;
+ absolute_l3cache_size_ = absolute_val;
+ // Realloc memory for sgemm in this context.
+ workspace_.clear();
+ workspace_.Resize({llc_size()});
+    workspace_.mutable_data<int8_t>();
+ }
+
int llc_size() const {
- auto size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
- : L2_cache_[active_ids_[0]];
+ auto size = absolute_l3cache_size_;
+ switch (l3_cache_method_) {
+ // kDeviceL3Cache = 0, use the system L3 Cache size, best performance.
+ case L3CacheSetMethod::kDeviceL3Cache:
+ size = L3_cache_[active_ids_[0]] > 0 ? L3_cache_[active_ids_[0]]
+ : L2_cache_[active_ids_[0]];
+ break;
+ // kDeviceL2Cache = 1, use the system L2 Cache size, trade off performance
+ // with less memory consumption.
+ case L3CacheSetMethod::kDeviceL2Cache:
+ size = L2_cache_[active_ids_[0]];
+ break;
+ // kAbsolute = 2, use the external setting.
+ case L3CacheSetMethod::kAbsolute:
+ break;
+ default:
+ LOG(FATAL) << "Error: unknown l3_cache_method_ !";
+ }
return size > 0 ? size : 512 * 1024;
}
+
bool has_dot() const { return dot_[active_ids_[0]]; }
bool has_fp16() const { return fp16_[active_ids_[0]]; }
@@ -121,6 +155,10 @@ class DeviceInfo {
void RequestPowerRandHighMode(int shift_num, int thread_num);
void RequestPowerRandLowMode(int shift_num, int thread_num);
+ // Methods for allocating L3Cache on Arm platform
+ // Enum class L3CacheSetMethod is declared in `lite/api/paddle_api.h`
+ L3CacheSetMethod l3_cache_method_{L3CacheSetMethod::kDeviceL3Cache};
+ int absolute_l3cache_size_{-1};
DeviceInfo() = default;
};
#endif // LITE_WITH_ARM
diff --git a/lite/core/memory.h b/lite/core/memory.h
index c80c8fb6b6e1356ebfa52920a8ee39f61ed20692..872cfd120ca0db889ec6cacebcba1431dafc931b 100644
--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -13,6 +13,7 @@
// limitations under the License.
#pragma once
+#include <algorithm>
#include
#include "lite/api/paddle_place.h"
#include "lite/core/target_wrapper.h"
@@ -140,20 +141,21 @@ class Buffer {
#ifdef LITE_WITH_OPENCL
template <typename T>
void ResetLazyImage2D(TargetType target,
- const size_t img_w,
- const size_t img_h,
+ const size_t img_w_req,
+ const size_t img_h_req,
void* host_ptr = nullptr) {
- if (target != target_ || cl_image2d_width_ < img_w ||
- cl_image2d_height_ < img_h || host_ptr != nullptr) {
+ if (target != target_ || cl_image2d_width_ < img_w_req ||
+ cl_image2d_height_ < img_h_req || host_ptr != nullptr) {
CHECK_EQ(own_data_, true) << "Can not reset unowned buffer.";
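+      // Only grow the image2d extents: keep the larger of the current and the
+      // requested width/height so the backing image can be reused across
+      // differently-sized requests.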
+ cl_image2d_width_ = std::max(cl_image2d_width_, img_w_req);
+ cl_image2d_height_ = std::max(cl_image2d_height_, img_h_req);
Free();
-      data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
+      data_ = TargetWrapperCL::MallocImage<T>(
+          cl_image2d_width_, cl_image2d_height_, host_ptr);
target_ = target;
- space_ = sizeof(T) * img_w * img_h *
+ space_ = sizeof(T) * cl_image2d_width_ * cl_image2d_height_ *
4; // un-used for opencl Image2D, 4 for RGBA,
cl_use_image2d_ = true;
- cl_image2d_width_ = img_w;
- cl_image2d_height_ = img_h;
}
}
#endif
diff --git a/lite/core/memory_test.cc b/lite/core/memory_test.cc
index cd9062afca7fbf05ef639fed34c50bdf8ee3cb7a..6343854db2b75f7db1fff852056f3c4d86a48c85 100644
--- a/lite/core/memory_test.cc
+++ b/lite/core/memory_test.cc
@@ -28,6 +28,12 @@ TEST(memory, test) {
ASSERT_TRUE(buf_cuda);
TargetFree(TARGET(kCUDA), buf_cuda);
#endif
+
+#ifdef LITE_WITH_OPENCL
+ auto* buf_cl = TargetMalloc(TARGET(kOpenCL), 10);
+ ASSERT_TRUE(buf_cl);
+ TargetFree(TARGET(kOpenCL), buf_cl);
+#endif
}
} // namespace lite
diff --git a/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc b/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc
index d8e9d9db4664cd717dbc949134e5ef52f52c9b61..adafa0f5b546b3dd4beb3352e8087a7099c4931e 100644
--- a/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc
+++ b/lite/core/mir/fusion/__xpu__conv2d_fuse_pass.cc
@@ -244,6 +244,7 @@ class XPUConv2dBlock0Fuser : public FuseBase {
std::string output_name = "";
if (_with_relu) {
+ op_desc.SetAttr("act_type", std::string{"relu"});
output_name = matched.at("relu_out")->arg()->name;
} else {
output_name = matched.at("bn_out")->arg()->name;
@@ -433,6 +434,7 @@ class XPUConv2dBlock1Fuser : public FuseBase {
TARGET(kXPU), PRECISION(kFloat), DATALAYOUT(kNCHW));
scope->NewTensor(max_output_name);
op_desc.SetOutput("OutputMax", {max_output_name});
+ op_desc.SetAttr("act_type", std::string{"relu"});
auto conv_op = LiteOpRegistry::Global().Create("__xpu__conv2d");
auto& valid_places = conv_old->valid_places();
diff --git a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc
index 39773e272a3345454c00c4da4b7e7c69617afd69..0692928dd212dd6bfc61f7a53e6321ac93439993 100644
--- a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc
+++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc
@@ -307,7 +307,7 @@ class XPUResNetBlock0Fuser : public FuseBase {
matched.at("right_bn1_variance")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
- // XXX: keep these to fool SubgraphOp::AttachImpl()
+ // keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr("sub_block", 0);
    op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
    op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
@@ -570,7 +570,7 @@ class XPUResNetBlock1Fuser : public FuseBase {
matched.at("right_bn3_variance")->arg()->name,
});
op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
- // XXX: keep these to fool SubgraphOp::AttachImpl()
+ // keep these to fool SubgraphOp::AttachImpl()
op_desc.SetAttr("sub_block", 0);
    op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
    op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
@@ -599,9 +599,658 @@ class XPUResNetBlock1Fuser : public FuseBase {
}
};
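+// Matches the "dtype" variant of the ResNet down-sampling block: a left branch
+// of three conv2d+batch_norm stages (the first two followed by relu) and a
+// right branch of pool2d+conv2d+batch_norm, joined by elementwise_add + relu,
+// and fuses it into a single op typed "resnet_block0_d".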
+class XPUResNetDtypeBlock0Fuser : public FuseBase {
+ public:
+ XPUResNetDtypeBlock0Fuser() {}
+
+ void BuildPattern() override {
+ auto* input = VarNode("input")
+ ->assert_is_op_input("conv2d", "Input")
+ ->assert_is_op_input("pool2d", "X")
+ ->AsInput();
+
+ auto* left_conv1_weight = VarNode("left_conv1_weight")
+ ->assert_is_op_input("conv2d", "Filter")
+ ->AsInput();
+ auto* left_conv1 = OpNode("left_conv1", "conv2d");
+ auto* left_conv1_out = VarNode("left_conv1_out")
+ ->assert_is_op_output("conv2d", "Output")
+ ->assert_is_op_input("batch_norm", "X")
+ ->AsIntermediate();
+ auto* left_bn1_scale = VarNode("left_bn1_scale")
+ ->assert_is_op_input("batch_norm", "Scale")
+ ->AsIntermediate();
+ auto* left_bn1_bias = VarNode("left_bn1_bias")
+ ->assert_is_op_input("batch_norm", "Bias")
+ ->AsInput();
+ auto* left_bn1_mean = VarNode("left_bn1_mean")
+ ->assert_is_op_input("batch_norm", "Mean")
+ ->AsIntermediate();
+ auto* left_bn1_var = VarNode("left_bn1_variance")
+ ->assert_is_op_input("batch_norm", "Variance")
+ ->AsIntermediate();
+ auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate();
+ auto* left_bn1_out = VarNode("left_bn1_out")
+ ->assert_is_op_output("batch_norm", "Y")
+ ->assert_is_op_input("relu", "X")
+ ->AsIntermediate();
+ auto* left_bn1_mean_out = VarNode("left_bn1_mean_out")
+ ->assert_is_op_output("batch_norm", "MeanOut")
+ ->AsIntermediate();
+ auto* left_bn1_var_out =
+ VarNode("left_bn1_var_out")
+ ->assert_is_op_output("batch_norm", "VarianceOut")
+ ->AsIntermediate();
+ auto* left_bn1_saved_mean =
+ VarNode("left_bn1_saved_mean")
+ ->assert_is_op_output("batch_norm", "SavedMean")
+ ->AsIntermediate();
+ auto* left_bn1_saved_var =
+ VarNode("left_bn1_saved_var")
+ ->assert_is_op_output("batch_norm", "SavedVariance")
+ ->AsIntermediate();
+ auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate();
+ auto* left_relu1_out = VarNode("left_relu1_out")
+ ->assert_is_op_output("relu", "Out")
+ ->assert_is_op_input("conv2d", "Input")
+ ->AsIntermediate();
+
+ auto* left_conv2_weight = VarNode("left_conv2_weight")
+ ->assert_is_op_input("conv2d", "Filter")
+ ->AsInput();
+ auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate();
+ auto* left_conv2_out = VarNode("left_conv2_out")
+ ->assert_is_op_output("conv2d", "Output")
+ ->assert_is_op_input("batch_norm", "X")
+ ->AsIntermediate();
+ auto* left_bn2_scale = VarNode("left_bn2_scale")
+ ->assert_is_op_input("batch_norm", "Scale")
+ ->AsIntermediate();
+ auto* left_bn2_bias = VarNode("left_bn2_bias")
+ ->assert_is_op_input("batch_norm", "Bias")
+ ->AsInput();
+ auto* left_bn2_mean = VarNode("left_bn2_mean")
+ ->assert_is_op_input("batch_norm", "Mean")
+ ->AsIntermediate();
+ auto* left_bn2_var = VarNode("left_bn2_variance")
+ ->assert_is_op_input("batch_norm", "Variance")
+ ->AsIntermediate();
+ auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate();
+ auto* left_bn2_out = VarNode("left_bn2_out")
+ ->assert_is_op_output("batch_norm", "Y")
+ ->assert_is_op_input("relu", "X")
+ ->AsIntermediate();
+ auto* left_bn2_mean_out = VarNode("left_bn2_mean_out")
+ ->assert_is_op_output("batch_norm", "MeanOut")
+ ->AsIntermediate();
+ auto* left_bn2_var_out =
+ VarNode("left_bn2_var_out")
+ ->assert_is_op_output("batch_norm", "VarianceOut")
+ ->AsIntermediate();
+ auto* left_bn2_saved_mean =
+ VarNode("left_bn2_saved_mean")
+ ->assert_is_op_output("batch_norm", "SavedMean")
+ ->AsIntermediate();
+ auto* left_bn2_saved_var =
+ VarNode("left_bn2_saved_var")
+ ->assert_is_op_output("batch_norm", "SavedVariance")
+ ->AsIntermediate();
+ auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate();
+ auto* left_relu2_out = VarNode("left_relu2_out")
+ ->assert_is_op_output("relu", "Out")
+ ->assert_is_op_input("conv2d", "Input")
+ ->AsIntermediate();
+
+ auto* left_conv3_weight = VarNode("left_conv3_weight")
+ ->assert_is_op_input("conv2d", "Filter")
+ ->AsInput();
+ auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate();
+ auto* left_conv3_out = VarNode("left_conv3_out")
+ ->assert_is_op_output("conv2d", "Output")
+ ->assert_is_op_input("batch_norm", "X")
+ ->AsIntermediate();
+ auto* left_bn3_scale = VarNode("left_bn3_scale")
+ ->assert_is_op_input("batch_norm", "Scale")
+ ->AsIntermediate();
+ auto* left_bn3_bias = VarNode("left_bn3_bias")
+ ->assert_is_op_input("batch_norm", "Bias")
+ ->AsInput();
+ auto* left_bn3_mean = VarNode("left_bn3_mean")
+ ->assert_is_op_input("batch_norm", "Mean")
+ ->AsIntermediate();
+ auto* left_bn3_var = VarNode("left_bn3_variance")
+ ->assert_is_op_input("batch_norm", "Variance")
+ ->AsIntermediate();
+ auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate();
+ auto* left_bn3_out = VarNode("left_bn3_out")
+ ->assert_is_op_output("batch_norm", "Y")
+ ->assert_is_op_input("elementwise_add", "Y")
+ ->AsIntermediate();
+ auto* left_bn3_mean_out = VarNode("left_bn3_mean_out")
+ ->assert_is_op_output("batch_norm", "MeanOut")
+ ->AsIntermediate();
+ auto* left_bn3_var_out =
+ VarNode("left_bn3_var_out")
+ ->assert_is_op_output("batch_norm", "VarianceOut")
+ ->AsIntermediate();
+ auto* left_bn3_saved_mean =
+ VarNode("left_bn3_saved_mean")
+ ->assert_is_op_output("batch_norm", "SavedMean")
+ ->AsIntermediate();
+ auto* left_bn3_saved_var =
+ VarNode("left_bn3_saved_var")
+ ->assert_is_op_output("batch_norm", "SavedVariance")
+ ->AsIntermediate();
+
+ auto* right_pool = OpNode("right_pool", "pool2d")->AsIntermediate();
+ auto* right_pool_out = VarNode("right_pool_out")
+ ->assert_is_op_output("pool2d", "Out")
+ ->assert_is_op_input("conv2d", "Input")
+ ->AsIntermediate();
+ auto* right_conv1_weight = VarNode("right_conv1_weight")
+ ->assert_is_op_input("conv2d", "Filter")
+ ->AsInput();
+ auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate();
+ auto* right_conv1_out = VarNode("right_conv1_out")
+ ->assert_is_op_output("conv2d", "Output")
+ ->assert_is_op_input("batch_norm", "X")
+ ->AsIntermediate();
+ auto* right_bn1_scale = VarNode("right_bn1_scale")
+ ->assert_is_op_input("batch_norm", "Scale")
+ ->AsIntermediate();
+ auto* right_bn1_bias = VarNode("right_bn1_bias")
+ ->assert_is_op_input("batch_norm", "Bias")
+ ->AsInput();
+ auto* right_bn1_mean = VarNode("right_bn1_mean")
+ ->assert_is_op_input("batch_norm", "Mean")
+ ->AsIntermediate();
+ auto* right_bn1_var = VarNode("right_bn1_variance")
+ ->assert_is_op_input("batch_norm", "Variance")
+ ->AsIntermediate();
+ auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate();
+ auto* right_bn1_out = VarNode("right_bn1_out")
+ ->assert_is_op_output("batch_norm", "Y")
+ ->assert_is_op_input("elementwise_add", "X")
+ ->AsIntermediate();
+ auto* right_bn1_mean_out =
+ VarNode("right_bn1_mean_out")
+ ->assert_is_op_output("batch_norm", "MeanOut")
+ ->AsIntermediate();
+ auto* right_bn1_var_out =
+ VarNode("right_bn1_var_out")
+ ->assert_is_op_output("batch_norm", "VarianceOut")
+ ->AsIntermediate();
+ auto* right_bn1_saved_mean =
+ VarNode("right_bn1_saved_mean")
+ ->assert_is_op_output("batch_norm", "SavedMean")
+ ->AsIntermediate();
+ auto* right_bn1_saved_var =
+ VarNode("right_bn1_saved_var")
+ ->assert_is_op_output("batch_norm", "SavedVariance")
+ ->AsIntermediate();
+
+ auto* add = OpNode("add", "elementwise_add")->AsIntermediate();
+ auto* add_out = VarNode("add_out")
+ ->assert_is_op_output("elementwise_add", "Out")
+ ->assert_is_op_input("relu", "X")
+ ->AsIntermediate();
+ auto* relu = OpNode("relu", "relu")->AsIntermediate();
+ auto* relu_out =
+ VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput();
+
+ *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >>
+ *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >>
+ *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >>
+ *left_conv3 >> *left_conv3_out >> *left_bn3 >> *left_bn3_out >> *add;
+
+ *left_conv1_weight >> *left_conv1;
+ *left_bn1_scale >> *left_bn1;
+ *left_bn1_bias >> *left_bn1;
+ *left_bn1_mean >> *left_bn1;
+ *left_bn1_var >> *left_bn1;
+ *left_bn1 >> *left_bn1_mean_out;
+ *left_bn1 >> *left_bn1_var_out;
+ *left_bn1 >> *left_bn1_saved_mean;
+ *left_bn1 >> *left_bn1_saved_var;
+
+ *left_conv2_weight >> *left_conv2;
+ *left_bn2_scale >> *left_bn2;
+ *left_bn2_bias >> *left_bn2;
+ *left_bn2_mean >> *left_bn2;
+ *left_bn2_var >> *left_bn2;
+ *left_bn2 >> *left_bn2_mean_out;
+ *left_bn2 >> *left_bn2_var_out;
+ *left_bn2 >> *left_bn2_saved_mean;
+ *left_bn2 >> *left_bn2_saved_var;
+
+ *left_conv3_weight >> *left_conv3;
+ *left_bn3_scale >> *left_bn3;
+ *left_bn3_bias >> *left_bn3;
+ *left_bn3_mean >> *left_bn3;
+ *left_bn3_var >> *left_bn3;
+ *left_bn3 >> *left_bn3_mean_out;
+ *left_bn3 >> *left_bn3_var_out;
+ *left_bn3 >> *left_bn3_saved_mean;
+ *left_bn3 >> *left_bn3_saved_var;
+
+ *input >> *right_pool >> *right_pool_out >> *right_conv1 >>
+ *right_conv1_out >> *right_bn1 >> *right_bn1_out >> *add;
+
+ *right_conv1_weight >> *right_conv1;
+ *right_bn1_scale >> *right_bn1;
+ *right_bn1_bias >> *right_bn1;
+ *right_bn1_mean >> *right_bn1;
+ *right_bn1_var >> *right_bn1;
+ *right_bn1 >> *right_bn1_mean_out;
+ *right_bn1 >> *right_bn1_var_out;
+ *right_bn1 >> *right_bn1_saved_mean;
+ *right_bn1 >> *right_bn1_saved_var;
+
+ *add >> *add_out >> *relu >> *relu_out;
+ }
+
+ void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+ cpp::OpDesc op_desc;
+ op_desc.SetType("resnet_block0_d");
+ op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
+ op_desc.SetInput("Filter",
+ {
+ matched.at("left_conv1_weight")->arg()->name,
+ matched.at("left_conv2_weight")->arg()->name,
+ matched.at("left_conv3_weight")->arg()->name,
+ matched.at("right_conv1_weight")->arg()->name,
+ });
+ op_desc.SetInput("Scale",
+ {
+ matched.at("left_bn1_scale")->arg()->name,
+ matched.at("left_bn2_scale")->arg()->name,
+ matched.at("left_bn3_scale")->arg()->name,
+ matched.at("right_bn1_scale")->arg()->name,
+ });
+ op_desc.SetInput("Bias",
+ {
+ matched.at("left_bn1_bias")->arg()->name,
+ matched.at("left_bn2_bias")->arg()->name,
+ matched.at("left_bn3_bias")->arg()->name,
+ matched.at("right_bn1_bias")->arg()->name,
+ });
+ op_desc.SetInput("Mean",
+ {
+ matched.at("left_bn1_mean")->arg()->name,
+ matched.at("left_bn2_mean")->arg()->name,
+ matched.at("left_bn3_mean")->arg()->name,
+ matched.at("right_bn1_mean")->arg()->name,
+ });
+ op_desc.SetInput("Var",
+ {
+ matched.at("left_bn1_variance")->arg()->name,
+ matched.at("left_bn2_variance")->arg()->name,
+ matched.at("left_bn3_variance")->arg()->name,
+ matched.at("right_bn1_variance")->arg()->name,
+ });
+ op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
+ // keep these to fool SubgraphOp::AttachImpl()
+ op_desc.SetAttr("sub_block", 0);
+    op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
+    op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
+
+ auto block0_stmt = matched.at("left_conv1")->stmt();
+ // block0_stmt->ResetOp(op_desc, graph->valid_places());
+ auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
+    auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
+    sub_program_desc->AddBlock<cpp::BlockDesc>();
+    static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
+ ->SetProgramDesc(sub_program_desc);
+ fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
+ fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
+ block0_stmt->SetOp(fake_subgraph_op);
+
+    std::vector<std::string> froms = {
+ "left_conv2_weight",
+ "left_conv3_weight",
+ "right_conv1_weight",
+ "left_bn1_bias",
+ "left_bn2_bias",
+ "left_bn3_bias",
+ "right_bn1_bias",
+ };
+ for (auto& from : froms) {
+ IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1"));
+ }
+ IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out"));
+ }
+};
+
class XPUResNet50Fuser : public xpu::XPUFuseBase {
public:
- XPUResNet50Fuser() {}
+ XPUResNet50Fuser() {}
+
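+  // Matches the full ResNet50 backbone: the stem conv2d+batch_norm+relu+pool2d
+  // followed by the sixteen already-fused resnet_block0/resnet_block1 ops and
+  // the final pool2d, and replaces it with a single __xpu__resnet50 op.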
+ void BuildPattern() override {
+ auto* input =
+ VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput();
+
+ auto* top_conv_weight = VarNode("top_conv_weight")
+ ->assert_is_op_input("conv2d", "Filter")
+ ->AsInput();
+ auto* top_conv = OpNode("top_conv", "conv2d");
+ auto* top_conv_out = VarNode("top_conv_out")
+ ->assert_is_op_output("conv2d", "Output")
+ ->assert_is_op_input("batch_norm", "X")
+ ->AsIntermediate();
+ auto* top_bn_scale = VarNode("top_bn_scale")
+ ->assert_is_op_input("batch_norm", "Scale")
+ ->AsIntermediate();
+ auto* top_bn_bias = VarNode("top_bn_bias")
+ ->assert_is_op_input("batch_norm", "Bias")
+ ->AsInput();
+ auto* top_bn_mean = VarNode("top_bn_mean")
+ ->assert_is_op_input("batch_norm", "Mean")
+ ->AsIntermediate();
+ auto* top_bn_var = VarNode("top_bn_variance")
+ ->assert_is_op_input("batch_norm", "Variance")
+ ->AsIntermediate();
+ auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate();
+ auto* top_bn_out = VarNode("top_bn_out")
+ ->assert_is_op_output("batch_norm", "Y")
+ ->assert_is_op_input("relu", "X")
+ ->AsIntermediate();
+ auto* top_bn_mean_out = VarNode("top_bn_mean_out")
+ ->assert_is_op_output("batch_norm", "MeanOut")
+ ->AsIntermediate();
+ auto* top_bn_var_out =
+ VarNode("top_bn_var_out")
+ ->assert_is_op_output("batch_norm", "VarianceOut")
+ ->AsIntermediate();
+ auto* top_bn_saved_mean =
+ VarNode("top_bn_saved_mean")
+ ->assert_is_op_output("batch_norm", "SavedMean")
+ ->AsIntermediate();
+ auto* top_bn_saved_var =
+ VarNode("top_bn_saved_var")
+ ->assert_is_op_output("batch_norm", "SavedVariance")
+ ->AsIntermediate();
+ auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate();
+ auto* top_relu_out = VarNode("top_relu_out")
+ ->assert_is_op_output("relu", "Out")
+ ->assert_is_op_input("pool2d", "X")
+ ->AsIntermediate();
+ auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate();
+ auto* top_pool_out = VarNode("top_pool_out")
+ ->assert_is_op_output("pool2d", "Out")
+ ->assert_is_op_input("resnet_block0", "Inputs")
+ ->AsIntermediate();
+
+    // The Filter/Scale/Bias/Mean/Var args of these block ops are left out of
+    // the pattern; they are gathered from each op's op_info in InsertNewNode().
+ auto* resnet_block0_1 =
+ OpNode("resnet_block0_1", "resnet_block0")->AsIntermediate();
+ auto* resnet_block0_1_out =
+ VarNode("resnet_block0_1_out")
+ ->assert_is_op_output("resnet_block0", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_1_1 =
+ OpNode("resnet_block1_1_1", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_1_1_out =
+ VarNode("resnet_block1_1_1_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_1_2 =
+ OpNode("resnet_block1_1_2", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_1_2_out =
+ VarNode("resnet_block1_1_2_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+
+ auto* resnet_block0_2 =
+ OpNode("resnet_block0_2", "resnet_block0")->AsIntermediate();
+ auto* resnet_block0_2_out =
+ VarNode("resnet_block0_2_out")
+ ->assert_is_op_output("resnet_block0", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_2_1 =
+ OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_2_1_out =
+ VarNode("resnet_block1_2_1_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_2_2 =
+ OpNode("resnet_block1_2_2", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_2_2_out =
+ VarNode("resnet_block1_2_2_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_2_3 =
+ OpNode("resnet_block1_2_3", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_2_3_out =
+ VarNode("resnet_block1_2_3_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+
+ auto* resnet_block0_3 =
+ OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate();
+ auto* resnet_block0_3_out =
+ VarNode("resnet_block0_3_out")
+ ->assert_is_op_output("resnet_block0", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_3_1 =
+ OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_3_1_out =
+ VarNode("resnet_block1_3_1_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_3_2 =
+ OpNode("resnet_block1_3_2", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_3_2_out =
+ VarNode("resnet_block1_3_2_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_3_3 =
+ OpNode("resnet_block1_3_3", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_3_3_out =
+ VarNode("resnet_block1_3_3_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_3_4 =
+ OpNode("resnet_block1_3_4", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_3_4_out =
+ VarNode("resnet_block1_3_4_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_3_5 =
+ OpNode("resnet_block1_3_5", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_3_5_out =
+ VarNode("resnet_block1_3_5_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+
+ auto* resnet_block0_4 =
+ OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate();
+ auto* resnet_block0_4_out =
+ VarNode("resnet_block0_4_out")
+ ->assert_is_op_output("resnet_block0", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_4_1 =
+ OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_4_1_out =
+ VarNode("resnet_block1_4_1_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+ auto* resnet_block1_4_2 =
+ OpNode("resnet_block1_4_2", "resnet_block1")->AsIntermediate();
+ auto* resnet_block1_4_2_out =
+ VarNode("resnet_block1_4_2_out")
+ ->assert_is_op_output("resnet_block1", "Outputs")
+ ->AsIntermediate();
+
+ auto* bottom_pool = OpNode("bottom_pool", "pool2d")->AsIntermediate();
+ auto* bottom_pool_out = VarNode("bottom_pool_out")
+ ->assert_is_op_output("pool2d", "Out")
+ ->AsOutput();
+
+ *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >>
+ *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >>
+ *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >>
+ *resnet_block1_1_1_out >> *resnet_block1_1_2 >>
+ *resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >>
+ *resnet_block1_2_1 >> *resnet_block1_2_1_out >> *resnet_block1_2_2 >>
+ *resnet_block1_2_2_out >> *resnet_block1_2_3 >>
+ *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >>
+ *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >>
+ *resnet_block1_3_2_out >> *resnet_block1_3_3 >>
+ *resnet_block1_3_3_out >> *resnet_block1_3_4 >>
+ *resnet_block1_3_4_out >> *resnet_block1_3_5 >>
+ *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >>
+ *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >>
+ *resnet_block1_4_2_out >> *bottom_pool >> *bottom_pool_out;
+
+ *top_conv_weight >> *top_conv;
+ *top_bn_scale >> *top_bn;
+ *top_bn_bias >> *top_bn;
+ *top_bn_mean >> *top_bn;
+ *top_bn_var >> *top_bn;
+ *top_bn >> *top_bn_mean_out;
+ *top_bn >> *top_bn_var_out;
+ *top_bn >> *top_bn_saved_mean;
+ *top_bn >> *top_bn_saved_var;
+ }
+
+ void InsertNewNode(SSAGraph* graph,
+ const key2nodes_t& matched,
+                     const std::vector<Node*>& extra_input_vars) override {
+ cpp::OpDesc op_desc;
+ op_desc.SetType("__xpu__resnet50");
+ op_desc.SetInput("Input", {matched.at("input")->arg()->name});
+    std::vector<std::string> filter_name = {
+        matched.at("top_conv_weight")->arg()->name};
+    std::vector<std::string> scale_name = {
+        matched.at("top_bn_scale")->arg()->name};
+    std::vector<std::string> bias_name = {
+        matched.at("top_bn_bias")->arg()->name};
+    std::vector<std::string> mean_name = {
+        matched.at("top_bn_mean")->arg()->name};
+    std::vector<std::string> var_name = {
+        matched.at("top_bn_variance")->arg()->name};
+    std::vector<std::string> max_filter_name;
+    std::vector<std::string> resnet_block_vec = {
+ "resnet_block0_1",
+ "resnet_block1_1_1",
+ "resnet_block1_1_2",
+ "resnet_block0_2",
+ "resnet_block1_2_1",
+ "resnet_block1_2_2",
+ "resnet_block1_2_3",
+ "resnet_block0_3",
+ "resnet_block1_3_1",
+ "resnet_block1_3_2",
+ "resnet_block1_3_3",
+ "resnet_block1_3_4",
+ "resnet_block1_3_5",
+ "resnet_block0_4",
+ "resnet_block1_4_1",
+ "resnet_block1_4_2",
+ };
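+    // Walk the matched blocks in order and gather each block's Filter,
+    // Scale, Bias, Mean and Var inputs for the fused op.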
+ for (auto& block : resnet_block_vec) {
+ auto* block_op_info = matched.at(block)->stmt()->op_info();
+ auto block_filter_name = block_op_info->Input("Filter");
+ std::copy(block_filter_name.begin(),
+ block_filter_name.end(),
+ std::back_inserter(filter_name));
+ auto block_scale_name = block_op_info->Input("Scale");
+ std::copy(block_scale_name.begin(),
+ block_scale_name.end(),
+ std::back_inserter(scale_name));
+ auto block_bias_name = block_op_info->Input("Bias");
+ std::copy(block_bias_name.begin(),
+ block_bias_name.end(),
+ std::back_inserter(bias_name));
+ auto block_mean_name = block_op_info->Input("Mean");
+ std::copy(block_mean_name.begin(),
+ block_mean_name.end(),
+ std::back_inserter(mean_name));
+ auto block_var_name = block_op_info->Input("Var");
+ std::copy(block_var_name.begin(),
+ block_var_name.end(),
+ std::back_inserter(var_name));
+ }
+ op_desc.SetInput("Filter", filter_name);
+ op_desc.SetInput("Bias", bias_name);
+ op_desc.SetOutput("Output", {matched.at("bottom_pool_out")->arg()->name});
+ op_desc.SetAttr("xpu", 1);
+
+ auto* resnet50_stmt = matched.at("top_conv")->stmt();
+ auto* scope = resnet50_stmt->op()->scope();
+ for (size_t i = 0; i < filter_name.size(); ++i) {
+ auto* filter_t = scope->FindMutableTensor(filter_name[i]);
+ auto* scale_t = scope->FindMutableTensor(scale_name[i]);
+ auto* bias_t = scope->FindMutableTensor(bias_name[i]);
+ auto* mean_t = scope->FindMutableTensor(mean_name[i]);
+ auto* var_t = scope->FindMutableTensor(var_name[i]);
+
+ int mean_len = mean_t->numel();
+ int filter_len = filter_t->numel();
+ int filter_stride = filter_len / mean_len;
+
+      float* filter_on_host = filter_t->mutable_data<float>();
+      float* scale_on_host = scale_t->mutable_data<float>();
+      float* bias_on_host = bias_t->mutable_data<float>();
+      float* mean_on_host = mean_t->mutable_data<float>();
+      float* var_on_host = var_t->mutable_data<float>();
+
+ // Perform preprocess
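+      // Fold the batch_norm parameters into the conv filter and bias:
+      //   scale' = scale / sqrt(var + eps)
+      //   filter *= scale' (per output channel), bias += -mean * scale'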
+ for (int i = 0; i < mean_len; ++i) {
+ scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f);
+ }
+ for (int i = 0; i < mean_len; ++i) {
+ for (int j = 0; j < filter_stride; ++j) {
+ filter_on_host[i * filter_stride + j] *= scale_on_host[i];
+ }
+ }
+ for (int i = 0; i < mean_len; ++i) {
+ bias_on_host[i] += -mean_on_host[i] * scale_on_host[i];
+ }
+
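+      // Symmetric int16 quantization of the folded filter: find max|w|,
+      // convert FP32 -> int16 in place, and record the max in a new
+      // "<filter>_max" tensor fed to the fused op through "MaxFilter".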
+ float max_f =
+ paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
+      std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
+ paddle::lite::xpu::math::ConvertFP32ToInt16(
+ filter_on_host, filter_int16.get(), max_f, filter_len);
+ memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
+
+ // create new arg in graph and scope
+ std::string max_name = filter_name[i] + "_max";
+ max_filter_name.push_back(max_name);
+ auto* max_filter_node = graph->NewArgumentNode(max_name);
+ max_filter_node->arg()->is_weight = true;
+ max_filter_node->arg()->type = LiteType::GetTensorTy(
+ TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+ DirectedLink(max_filter_node, matched.at("top_conv"));
+ auto* max_filter_t = scope->NewTensor(max_name);
+ max_filter_t->Resize({4});
+      float* max_ptr = max_filter_t->mutable_data<float>();
+ max_ptr[0] = max_f;
+ max_ptr[1] = max_f;
+ max_ptr[2] = max_f;
+ max_ptr[3] = max_f;
+ }
+ op_desc.SetInput("MaxFilter", max_filter_name);
+
+ auto resnet50_op = LiteOpRegistry::Global().Create(op_desc.Type());
+ resnet50_op->Attach(op_desc, scope);
+ resnet50_op->SetValidPlaces(resnet50_stmt->op()->valid_places());
+ auto kernels = resnet50_op->CreateKernels(resnet50_op->valid_places());
+ resnet50_stmt->SetOp(resnet50_op);
+ resnet50_stmt->SetKernels(std::move(kernels));
+
+ IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv"));
+ for (auto* node : extra_input_vars) {
+ IR_NODE_LINK_TO(node, matched.at("top_conv"));
+ }
+ IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("bottom_pool_out"));
+ }
+};
+
+class XPUResNet50DtypeFuser : public xpu::XPUFuseBase {
+ public:
+ XPUResNet50DtypeFuser() {}
void BuildPattern() override {
auto* input =
@@ -650,8 +1299,102 @@ class XPUResNet50Fuser : public xpu::XPUFuseBase {
auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate();
auto* top_relu_out = VarNode("top_relu_out")
->assert_is_op_output("relu", "Out")
- ->assert_is_op_input("pool2d", "X")
+ ->assert_is_op_input("conv2d", "Input")
+ ->AsIntermediate();
+
+ auto* second_conv_weight = VarNode("second_conv_weight")
+ ->assert_is_op_input("conv2d", "Filter")
+ ->AsInput();
+ auto* second_conv = OpNode("second_conv", "conv2d")->AsIntermediate();
+ auto* second_conv_out = VarNode("second_conv_out")
+ ->assert_is_op_output("conv2d", "Output")
+ ->assert_is_op_input("batch_norm", "X")
+ ->AsIntermediate();
+ auto* second_bn_scale = VarNode("second_bn_scale")
+ ->assert_is_op_input("batch_norm", "Scale")
+ ->AsIntermediate();
+ auto* second_bn_bias = VarNode("second_bn_bias")
+ ->assert_is_op_input("batch_norm", "Bias")
+ ->AsInput();
+ auto* second_bn_mean = VarNode("second_bn_mean")
+ ->assert_is_op_input("batch_norm", "Mean")
+ ->AsIntermediate();
+ auto* second_bn_var = VarNode("second_bn_variance")
+ ->assert_is_op_input("batch_norm", "Variance")
+ ->AsIntermediate();
+ auto* second_bn = OpNode("second_bn", "batch_norm")->AsIntermediate();
+ auto* second_bn_out = VarNode("second_bn_out")
+ ->assert_is_op_output("batch_norm", "Y")
+ ->assert_is_op_input("relu", "X")
+ ->AsIntermediate();
+ auto* second_bn_mean_out =
+ VarNode("second_bn_mean_out")
+ ->assert_is_op_output("batch_norm", "MeanOut")
+ ->AsIntermediate();
+ auto* second_bn_var_out =
+ VarNode("second_bn_var_out")
+ ->assert_is_op_output("batch_norm", "VarianceOut")
+ ->AsIntermediate();
+ auto* second_bn_saved_mean =
+ VarNode("second_bn_saved_mean")
+ ->assert_is_op_output("batch_norm", "SavedMean")
+ ->AsIntermediate();
+ auto* second_bn_saved_var =
+ VarNode("second_bn_saved_var")
+ ->assert_is_op_output("batch_norm", "SavedVariance")
+ ->AsIntermediate();
+ auto* second_relu = OpNode("second_relu", "relu")->AsIntermediate();
+ auto* second_relu_out = VarNode("second_relu_out")
+ ->assert_is_op_output("relu", "Out")
+ ->assert_is_op_input("conv2d", "Input")
+ ->AsIntermediate();
+
+ auto* third_conv_weight = VarNode("third_conv_weight")
+ ->assert_is_op_input("conv2d", "Filter")
+ ->AsInput();
+ auto* third_conv = OpNode("third_conv", "conv2d")->AsIntermediate();
+ auto* third_conv_out = VarNode("third_conv_out")
+ ->assert_is_op_output("conv2d", "Output")
+ ->assert_is_op_input("batch_norm", "X")
+ ->AsIntermediate();
+ auto* third_bn_scale = VarNode("third_bn_scale")
+ ->assert_is_op_input("batch_norm", "Scale")
+ ->AsIntermediate();
+ auto* third_bn_bias = VarNode("third_bn_bias")
+ ->assert_is_op_input("batch_norm", "Bias")
+ ->AsInput();
+ auto* third_bn_mean = VarNode("third_bn_mean")
+ ->assert_is_op_input("batch_norm", "Mean")
+ ->AsIntermediate();
+ auto* third_bn_var = VarNode("third_bn_variance")
+ ->assert_is_op_input("batch_norm", "Variance")
+ ->AsIntermediate();
+ auto* third_bn = OpNode("third_bn", "batch_norm")->AsIntermediate();
+ auto* third_bn_out = VarNode("third_bn_out")
+ ->assert_is_op_output("batch_norm", "Y")
+ ->assert_is_op_input("relu", "X")
->AsIntermediate();
+ auto* third_bn_mean_out = VarNode("third_bn_mean_out")
+ ->assert_is_op_output("batch_norm", "MeanOut")
+ ->AsIntermediate();
+ auto* third_bn_var_out =
+ VarNode("third_bn_var_out")
+ ->assert_is_op_output("batch_norm", "VarianceOut")
+ ->AsIntermediate();
+ auto* third_bn_saved_mean =
+ VarNode("third_bn_saved_mean")
+ ->assert_is_op_output("batch_norm", "SavedMean")
+ ->AsIntermediate();
+ auto* third_bn_saved_var =
+ VarNode("third_bn_saved_var")
+ ->assert_is_op_output("batch_norm", "SavedVariance")
+ ->AsIntermediate();
+ auto* third_relu = OpNode("third_relu", "relu")->AsIntermediate();
+ auto* third_relu_out = VarNode("third_relu_out")
+ ->assert_is_op_output("relu", "Out")
+ ->assert_is_op_input("pool2d", "X")
+ ->AsIntermediate();
+
auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate();
auto* top_pool_out = VarNode("top_pool_out")
->assert_is_op_output("pool2d", "Out")
@@ -679,10 +1422,10 @@ class XPUResNet50Fuser : public xpu::XPUFuseBase {
->AsIntermediate();
auto* resnet_block0_2 =
- OpNode("resnet_block0_2", "resnet_block0")->AsIntermediate();
+ OpNode("resnet_block0_2", "resnet_block0_d")->AsIntermediate();
auto* resnet_block0_2_out =
VarNode("resnet_block0_2_out")
- ->assert_is_op_output("resnet_block0", "Outputs")
+ ->assert_is_op_output("resnet_block0_d", "Outputs")
->AsIntermediate();
auto* resnet_block1_2_1 =
OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate();
@@ -704,10 +1447,10 @@ class XPUResNet50Fuser : public xpu::XPUFuseBase {
->AsIntermediate();
auto* resnet_block0_3 =
- OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate();
+ OpNode("resnet_block0_3", "resnet_block0_d")->AsIntermediate();
auto* resnet_block0_3_out =
VarNode("resnet_block0_3_out")
- ->assert_is_op_output("resnet_block0", "Outputs")
+ ->assert_is_op_output("resnet_block0_d", "Outputs")
->AsIntermediate();
auto* resnet_block1_3_1 =
OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate();
@@ -741,10 +1484,10 @@ class XPUResNet50Fuser : public xpu::XPUFuseBase {
->AsIntermediate();
auto* resnet_block0_4 =
- OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate();
+ OpNode("resnet_block0_4", "resnet_block0_d")->AsIntermediate();
auto* resnet_block0_4_out =
VarNode("resnet_block0_4_out")
- ->assert_is_op_output("resnet_block0", "Outputs")
+ ->assert_is_op_output("resnet_block0_d", "Outputs")
->AsIntermediate();
auto* resnet_block1_4_1 =
OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate();
@@ -765,7 +1508,10 @@ class XPUResNet50Fuser : public xpu::XPUFuseBase {
->AsOutput();
*input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >>
- *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >>
+ *top_relu >> *top_relu_out >> *second_conv >> *second_conv_out >>
+ *second_bn >> *second_bn_out >> *second_relu >> *second_relu_out >>
+ *third_conv >> *third_conv_out >> *third_bn >> *third_bn_out >>
+ *third_relu >> *third_relu_out >> *top_pool >> *top_pool_out >>
*resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >>
*resnet_block1_1_1_out >> *resnet_block1_1_2 >>
*resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >>
@@ -789,24 +1535,59 @@ class XPUResNet50Fuser : public xpu::XPUFuseBase {
*top_bn >> *top_bn_var_out;
*top_bn >> *top_bn_saved_mean;
*top_bn >> *top_bn_saved_var;
+
+ *second_conv_weight >> *second_conv;
+ *second_bn_scale >> *second_bn;
+ *second_bn_bias >> *second_bn;
+ *second_bn_mean >> *second_bn;
+ *second_bn_var >> *second_bn;
+ *second_bn >> *second_bn_mean_out;
+ *second_bn >> *second_bn_var_out;
+ *second_bn >> *second_bn_saved_mean;
+ *second_bn >> *second_bn_saved_var;
+
+ *third_conv_weight >> *third_conv;
+ *third_bn_scale >> *third_bn;
+ *third_bn_bias >> *third_bn;
+ *third_bn_mean >> *third_bn;
+ *third_bn_var >> *third_bn;
+ *third_bn >> *third_bn_mean_out;
+ *third_bn >> *third_bn_var_out;
+ *third_bn >> *third_bn_saved_mean;
+ *third_bn >> *third_bn_saved_var;
}
void InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched,
                      const std::vector<Node*>& extra_input_vars) override {
cpp::OpDesc op_desc;
- op_desc.SetType("__xpu__resnet50");
+ op_desc.SetType("__xpu__resnet50_d");
op_desc.SetInput("Input", {matched.at("input")->arg()->name});
     std::vector<std::string> filter_name = {
- matched.at("top_conv_weight")->arg()->name};
+ matched.at("top_conv_weight")->arg()->name,
+ matched.at("second_conv_weight")->arg()->name,
+ matched.at("third_conv_weight")->arg()->name};
+
     std::vector<std::string> scale_name = {
- matched.at("top_bn_scale")->arg()->name};
+ matched.at("top_bn_scale")->arg()->name,
+ matched.at("second_bn_scale")->arg()->name,
+ matched.at("third_bn_scale")->arg()->name};
+
     std::vector<std::string> bias_name = {
- matched.at("top_bn_bias")->arg()->name};
+ matched.at("top_bn_bias")->arg()->name,
+ matched.at("second_bn_bias")->arg()->name,
+ matched.at("third_bn_bias")->arg()->name};
+
     std::vector<std::string> mean_name = {
- matched.at("top_bn_mean")->arg()->name};
+ matched.at("top_bn_mean")->arg()->name,
+ matched.at("second_bn_mean")->arg()->name,
+ matched.at("third_bn_mean")->arg()->name};
+
     std::vector<std::string> var_name = {
- matched.at("top_bn_variance")->arg()->name};
+ matched.at("top_bn_variance")->arg()->name,
+ matched.at("second_bn_variance")->arg()->name,
+ matched.at("third_bn_variance")->arg()->name};
+
     std::vector<std::string> max_filter_name;
     std::vector<std::string> resnet_block_vec = {
"resnet_block0_1",
@@ -900,7 +1681,9 @@ class XPUResNet50Fuser : public xpu::XPUFuseBase {
max_filter_node->arg()->is_weight = true;
max_filter_node->arg()->type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+
DirectedLink(max_filter_node, matched.at("top_conv"));
+
auto* max_filter_t = scope->NewTensor(max_name);
max_filter_t->Resize({4});
       float* max_ptr = max_filter_t->mutable_data<float>();
@@ -919,6 +1702,11 @@ class XPUResNet50Fuser : public xpu::XPUFuseBase {
resnet50_stmt->SetKernels(std::move(kernels));
IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv"));
+ IR_NODE_LINK_TO(matched.at("second_conv_weight"), matched.at("top_conv"));
+ IR_NODE_LINK_TO(matched.at("second_bn_bias"), matched.at("top_conv"));
+ IR_NODE_LINK_TO(matched.at("third_conv_weight"), matched.at("top_conv"));
+ IR_NODE_LINK_TO(matched.at("third_bn_bias"), matched.at("top_conv"));
+
for (auto* node : extra_input_vars) {
IR_NODE_LINK_TO(node, matched.at("top_conv"));
}
@@ -951,6 +1739,31 @@ class XPUResNet50FusePass : public ProgramPass {
}
};
+class XPUResNet50DtypeFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+ if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
+
+ bool changed = false;
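+    // The block fusers below rewrite the graph eagerly, so keep a backup
+    // that can be restored if the whole-ResNet50 matcher finds nothing.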
+ SSAGraph backup;
+ backup.CloneFrom(*graph);
+
+ fusion::XPUResNetBlock0Fuser block0_fuser;
+ changed |= block0_fuser(graph.get());
+ fusion::XPUResNetDtypeBlock0Fuser d_type_block0_fuser;
+ changed |= d_type_block0_fuser(graph.get());
+ fusion::XPUResNetBlock1Fuser block1_fuser;
+ changed |= block1_fuser(graph.get());
+ fusion::XPUResNet50DtypeFuser resnet50_d_fuser;
+ size_t n_matches = resnet50_d_fuser(graph.get());
+
+ if (changed && !n_matches) {
+      // Restore the graph from the backup if no whole ResNet50 graph was found
+ graph->CloneFrom(backup);
+ }
+ }
+};
+
} // namespace mir
} // namespace lite
} // namespace paddle
@@ -959,3 +1772,8 @@ REGISTER_MIR_PASS(__xpu__resnet_fuse_pass,
paddle::lite::mir::XPUResNet50FusePass)
.BindTargets({TARGET(kXPU)})
.BindKernel("__xpu__resnet50");
+
+REGISTER_MIR_PASS(__xpu__resnet_d_fuse_pass,
+ paddle::lite::mir::XPUResNet50DtypeFusePass)
+ .BindTargets({TARGET(kXPU)})
+ .BindKernel("__xpu__resnet50_d");
diff --git a/lite/core/mir/fusion/conv_conv_fuse_pass.cc b/lite/core/mir/fusion/conv_conv_fuse_pass.cc
index d277da87689d7aa1f21ef260013b6e81f2146a09..b2c5d8d15ab95fbcc43adc01c4189ae83b1316ed 100644
--- a/lite/core/mir/fusion/conv_conv_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_conv_fuse_pass.cc
@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/core/mir/fusion/conv_conv_fuse_pass.h"
+#include <list>
 #include <memory>
 #include <vector>
#include "lite/core/mir/fusion/conv_conv_fuser.h"
@@ -27,13 +28,10 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// initialze fuser params
   std::vector<bool> conv_has_bias_cases{true, false};
   std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
- bool has_fp32 = false;
bool has_int8 = false;
+ bool has_weight_quant = false;
for (auto& place : graph->valid_places()) {
if (place.target == TARGET(kARM) || place.target == TARGET(kHost)) {
- if (place.precision == PRECISION(kFloat)) {
- has_fp32 = true;
- }
if (place.precision == PRECISION(kInt8)) {
has_int8 = true;
}
@@ -42,8 +40,18 @@ void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
return;
}
}
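+  // Skip the fusion for weight-quantized models: look for any op that
+  // carries a "quantization_type" attribute.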
+  const std::list<Node>& nodes = graph->nodes();
+ for (auto& node : nodes) {
+ if (node.IsStmt()) {
+ auto* op_info = (node.stmt())->op_info();
+ if (op_info->HasAttr("quantization_type")) {
+ has_weight_quant = true;
+ break;
+ }
+ }
+ }
// only support arm-fp32
- if (has_int8 || (has_fp32 && has_int8)) {
+ if (has_int8 || has_weight_quant) {
return;
}
// only support fp32 fusion
diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
index da42d6d0c79a2a7975eacca7095fedababac6d89..4840a625c7551e96fa5f3ae03585bedf9a85c303 100644
--- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
+++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc
@@ -61,5 +61,4 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(lite_quant_dequant_fuse_pass,
paddle::lite::mir::QuantDequantFusePass)
- .BindTargets({TARGET(kAny)})
- .BindKernel("calib");
+ .BindTargets({TARGET(kAny)});
diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc
index 3817d0049c9e302b5b39aae6bca96dff2180bd73..bf1867ac3be2c8c9f8c1c39db156eee31b31c127 100644
--- a/lite/core/mir/memory_optimize_pass.cc
+++ b/lite/core/mir/memory_optimize_pass.cc
@@ -148,7 +148,7 @@ void MemoryOptimizePass::CollectLifeCycleByDevice(
int cur_life =
(*lifecycles)[TargetToStr(target_type)][var_name].second;
(*lifecycles)[TargetToStr(target_type)][var_name].second =
- std::max(max_lifecycle_, cur_life);
+ (std::max)(max_lifecycle_, cur_life);
}
}
++max_lifecycle_;
diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h
index 1b6c55e5e2b533c48a4a34feab9e0c5d5a157d73..3ecd92049d0f4838e80d743b82276cb7b6dfa79f 100644
--- a/lite/core/mir/static_kernel_pick_pass.h
+++ b/lite/core/mir/static_kernel_pick_pass.h
@@ -61,7 +61,7 @@ class StaticKernelPickPass : public mir::StmtPass {
float final_score{-1.};
Place winner_place{places[0]};
const int kMax =
-        std::numeric_limits<core::KernelPickFactor::value_type>::max();
+        (std::numeric_limits<core::KernelPickFactor::value_type>::max)();
size_t place_size = places.size();
// NOTE: We compare kernel's place with place in valid_places to select the
diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc
index 5a57623b0c984be24e2d0b97ee575b22d369fdad..1a615838e33b6688d7213787a7aa6ec35ed7f0b4 100644
--- a/lite/core/mir/subgraph/subgraph_pass_test.cc
+++ b/lite/core/mir/subgraph/subgraph_pass_test.cc
@@ -17,8 +17,6 @@
#include
#include "lite/api/paddle_api.h"
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
#include "lite/api/test_helper.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc
index 44b6eaf1eb0c5c96630dd66d129919b40f3ea8c6..c1529aacf85c713c6c381974c408b536c608fa61 100644
--- a/lite/core/mir/type_layout_cast_pass.cc
+++ b/lite/core/mir/type_layout_cast_pass.cc
@@ -82,8 +82,11 @@ void TypeLayoutTransformPass::ComplementInputs(SSAGraph* graph,
// not a good judge, but don't find the source of this issue from
// static_pick_kernel_pass
// to this pass.
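+  // Treat kHost/kX86/kARM uniformly as host-side targets for this check.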
+ auto is_host = [](TargetType x) -> bool {
+ return x == TARGET(kHost) || x == TARGET(kX86) || x == TARGET(kARM);
+ };
   auto* in_arg_type = const_cast<Type*>(in->AsArg().type);
- if (in_arg_type->target() == TARGET(kARM) &&
+ if (is_host(in_arg_type->target()) &&
in_arg_type->layout() == DATALAYOUT(kImageDefault)) {
return;
}
diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc
index 585aaf3b703bca0a0a34030106dbf793e2a31d52..dcab292be8f24a6294cb560506f6d03209552d4a 100644
--- a/lite/core/op_lite.cc
+++ b/lite/core/op_lite.cc
@@ -233,67 +233,98 @@ bool OpInfo::GetOutputIndex(const std::string &output_name, int *out) const {
return false;
}
-bool OpInfo::HasInputScale(const std::string &input_name) const {
- std::string argname;
- int index;
- if (GetInputArgname(input_name, &argname) &&
- GetInputIndex(input_name, &index)) {
- return HasAttr(argname + to_string(index) + "_scale");
+bool OpInfo::HasInputScale(const std::string &name, bool is_scale_name) const {
+ bool res = false;
+ if (is_scale_name) {
+ res = HasAttr(name);
} else {
- return false;
+ std::string argname;
+ int index;
+ if (GetInputArgname(name, &argname) && GetInputIndex(name, &index)) {
+ res = HasAttr(argname + to_string(index) + "_scale");
+ }
}
+ return res;
}
-bool OpInfo::HasOutputScale(const std::string &output_name) const {
- std::string argname;
- int index;
- if (GetOutputArgname(output_name, &argname) &&
- GetOutputIndex(output_name, &index)) {
- return HasAttr(argname + to_string(index) + "_scale");
+bool OpInfo::HasOutputScale(const std::string &name, bool is_scale_name) const {
+ bool res = false;
+ if (is_scale_name) {
+ res = HasAttr(name);
} else {
- return false;
+ std::string argname;
+ int index;
+ if (GetOutputArgname(name, &argname) && GetOutputIndex(name, &index)) {
+ res = HasAttr(argname + to_string(index) + "_scale");
+ }
}
+ return res;
}
-void OpInfo::SetInputScale(const std::string &input_name,
-                           const std::vector<float> &scale_value) {
- std::string argname;
- int index;
- CHECK(GetInputArgname(input_name, &argname));
- CHECK(GetInputIndex(input_name, &index));
- CHECK(scale_value.size() > 0)
- << "Error in SetInputScale: the scales should not be empty";
-  SetAttr<std::vector<float>>(argname + to_string(index) + "_scale",
- scale_value);
+void OpInfo::SetInputScale(const std::string &name,
+                           const std::vector<float> &scale_value,
+ bool is_scale_name) {
+ std::string scale_name;
+ if (is_scale_name) {
+ scale_name = name;
+ } else {
+ std::string argname;
+ int index;
+ CHECK(GetInputArgname(name, &argname));
+ CHECK(GetInputIndex(name, &index));
+ CHECK(scale_value.size() > 0)
+ << "Error in SetInputScale: the scales should not be empty";
+ scale_name = argname + to_string(index) + "_scale";
+ }
+  SetAttr<std::vector<float>>(scale_name, scale_value);
}
-void OpInfo::SetOutputScale(const std::string &output_name,
-                            const std::vector<float> &scale_value) {
- std::string argname;
- int index;
- CHECK(GetOutputArgname(output_name, &argname));
- CHECK(GetOutputIndex(output_name, &index));
- CHECK(scale_value.size() > 0)
- << "Error in SetOutputScale: the scales should not be empty";
-  SetAttr<std::vector<float>>(argname + to_string(index) + "_scale",
- scale_value);
+void OpInfo::SetOutputScale(const std::string &name,
+                            const std::vector<float> &scale_value,
+ bool is_scale_name) {
+ std::string scale_name;
+ if (is_scale_name) {
+ scale_name = name;
+ } else {
+ std::string argname;
+ int index;
+ CHECK(GetOutputArgname(name, &argname));
+ CHECK(GetOutputIndex(name, &index));
+ CHECK(scale_value.size() > 0)
+ << "Error in SetOutputScale: the scales should not be empty";
+ scale_name = argname + to_string(index) + "_scale";
+ }
+  SetAttr<std::vector<float>>(scale_name, scale_value);
}
-std::vector<float> OpInfo::GetInputScale(const std::string &input_name) const {
- std::string argname;
- int index;
- CHECK(GetInputArgname(input_name, &argname));
- CHECK(GetInputIndex(input_name, &index));
-  return GetAttr<std::vector<float>>(argname + to_string(index) + "_scale");
+std::vector<float> OpInfo::GetInputScale(const std::string &name,
+ bool is_scale_name) const {
+ std::string scale_name;
+ if (is_scale_name) {
+ scale_name = name;
+ } else {
+ std::string argname;
+ int index;
+ CHECK(GetInputArgname(name, &argname));
+ CHECK(GetInputIndex(name, &index));
+ scale_name = argname + to_string(index) + "_scale";
+ }
+  return GetAttr<std::vector<float>>(scale_name);
 }
-std::vector<float> OpInfo::GetOutputScale(
- const std::string &output_name) const {
- std::string argname;
- int index;
- CHECK(GetOutputArgname(output_name, &argname));
- CHECK(GetOutputIndex(output_name, &index));
-  return GetAttr<std::vector<float>>(argname + to_string(index) + "_scale");
+std::vector<float> OpInfo::GetOutputScale(const std::string &name,
+ bool is_scale_name) const {
+ std::string scale_name;
+ if (is_scale_name) {
+ scale_name = name;
+ } else {
+ std::string argname;
+ int index;
+ CHECK(GetOutputArgname(name, &argname));
+ CHECK(GetOutputIndex(name, &index));
+ scale_name = argname + to_string(index) + "_scale";
+ }
+  return GetAttr<std::vector<float>>(scale_name);
}
} // namespace lite
diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h
index d94753220a1b5d963092c62c43d7e49b03243c63..1e664152a39110bdfc28cbb037920b6174315aa5 100644
--- a/lite/core/op_lite.h
+++ b/lite/core/op_lite.h
@@ -251,19 +251,31 @@ class OpInfo : public cpp::OpDesc {
bool GetInputIndex(const std::string &input_name, int *out) const;
bool GetOutputIndex(const std::string &output_name, int *out) const;
- bool HasInputScale(const std::string &input_name) const;
- bool HasOutputScale(const std::string &output_name) const;
+  // If a quantized op has two input argnames (X, Y) and one output
+  // argname (Out), the scales of input argname X are saved in the op desc
+  // as (X0_scale, scale_value_0), (X1_scale, scale_value_1), ...
+  // The following APIs get or set the quantization scales in the op desc.
+  // When passing an input or output variable name, is_scale_name should be
+  // false; when passing a scale name such as X0_scale, is_scale_name should
+  // be true.
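+  // For example, HasInputScale("X0_scale", true) checks the scale attribute
+  // directly, while HasInputScale(input_var_name) resolves the attribute
+  // name from the variable name first.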
+ bool HasInputScale(const std::string &name, bool is_scale_name = false) const;
+ bool HasOutputScale(const std::string &name,
+ bool is_scale_name = false) const;
void SetInputScale(const std::string &input_name,
-                     const std::vector<float> &scale_value);
+                     const std::vector<float> &scale_value,
+ bool is_scale_name = false);
void SetOutputScale(const std::string &output_name,
-                      const std::vector<float> &scale_value);
+                      const std::vector<float> &scale_value,
+ bool is_scale_name = false);
   // For conv2d, depthwise_conv2d and mul, the weight scales form a vector.
   // Otherwise, all input and output scales are scalars, but we still store
   // each of them as a vector.
-  std::vector<float> GetInputScale(const std::string &input_name) const;
-  std::vector<float> GetOutputScale(const std::string &output_name) const;
+  std::vector<float> GetInputScale(const std::string &name,
+                                   bool is_scale_name = false) const;
+  std::vector<float> GetOutputScale(const std::string &name,
+                                    bool is_scale_name = false) const;
};
} // namespace lite
diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h
index 2dfc444a26ffe013ad05c81a003dd073cc133177..7709090c038cf81bee5a735b682ea0721ee30ec1 100644
--- a/lite/core/optimizer.h
+++ b/lite/core/optimizer.h
@@ -80,97 +80,99 @@ class Optimizer {
InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
if (passes.empty() || passes.size() == 1) {
-    std::vector<std::string> passes_local{
- {"lite_quant_dequant_fuse_pass", //
- "weight_quantization_preprocess_pass", //
- "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
- "lite_conv_bn_fuse_pass", //
- "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
- "lite_conv_conv_fuse_pass", //
- // TODO(Superjomn) Refine the fusion related design to select fusion
- // kernels for devices automatically.
- "lite_conv_activation_fuse_pass", //
- "lite_var_conv_2d_activation_fuse_pass", //
- "lite_match_matrix_activation_fuse_pass", //
- "lite_fc_fuse_pass", //
- "lite_shuffle_channel_fuse_pass", //
- "lite_transpose_softmax_transpose_fuse_pass", //
- "lite_interpolate_fuse_pass", //
- "identity_scale_eliminate_pass", //
- "lite_scales_fuse_pass", //
- "lite_sequence_reverse_embedding_fuse_pass", //
- "elementwise_mul_constant_eliminate_pass", //
- "lite_sequence_pool_concat_fuse_pass", //
- "lite_scale_activation_fuse_pass", //
+    std::vector<std::string> passes_local{{
+ "lite_quant_dequant_fuse_pass", //
+ "weight_quantization_preprocess_pass", //
+ "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn
+ "lite_conv_bn_fuse_pass", //
+ "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise
+ "lite_conv_conv_fuse_pass", //
+ // TODO(Superjomn) Refine the fusion related design to select fusion
+ // kernels for devices automatically.
+ "lite_conv_activation_fuse_pass", //
+ "lite_var_conv_2d_activation_fuse_pass", //
+ "lite_match_matrix_activation_fuse_pass", //
+ "lite_fc_fuse_pass", //
+ "lite_shuffle_channel_fuse_pass", //
+ "lite_transpose_softmax_transpose_fuse_pass", //
+ "lite_interpolate_fuse_pass", //
+ "identity_scale_eliminate_pass", //
+ "lite_scales_fuse_pass", //
+ "lite_sequence_reverse_embedding_fuse_pass", //
+ "elementwise_mul_constant_eliminate_pass", //
+ "lite_sequence_pool_concat_fuse_pass", //
+ "lite_scale_activation_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM)
- "lite_elementwise_activation_fuse_pass", //
+ "lite_elementwise_activation_fuse_pass", //
#endif
- "identity_dropout_eliminate_pass",
- "__xpu__resnet_fuse_pass",
- "__xpu__resnet_cbam_fuse_pass",
- "__xpu__conv2d_fuse_pass",
- "__xpu__conv2d_link_previous_out_max_pass",
- "__xpu__sfa_head_meanstd_fuse_pass",
- "__xpu__sfa_head_moment_fuse_pass",
- "__xpu__mmdnn_fuse_pass",
- "__xpu__multi_encoder_fuse_pass",
- "__xpu__embedding_with_eltwise_add_fuse_pass",
- "__xpu__fc_fuse_pass",
- "quantized_op_attributes_inference_pass", // Only for fully
- // quantized model, infer
- // the output scale and
- // fix the attribute
- // 'enable_int8' for all
- // of the quantized ops.
- "npu_subgraph_pass",
- "huawei_ascend_npu_subgraph_pass",
- "xpu_subgraph_pass",
- "bm_subgraph_pass",
- "apu_subgraph_pass",
- "rknpu_subgraph_pass",
- "mlu_subgraph_pass",
- "control_flow_op_unused_inputs_and_outputs_eliminate_pass",
- "static_kernel_pick_pass", // pick original kernel from graph
-
- "remove_tf_redundant_ops_pass",
- "variable_place_inference_pass", // inference arg/var's
-
- "mlu_postprocess_pass",
- // info(target/precision/layout/device)
- // using kernel info
- "argument_type_display_pass", // debug pass: show arg-type-node's
- // info
- // (target/precision/layout/device)
-
- "type_target_cast_pass", // add io_copy/io_copy_once if meet
- // different targets when last and next
- // node
- "variable_place_inference_pass", //
- "argument_type_display_pass", //
-
- "io_copy_kernel_pick_pass", //
- "argument_type_display_pass", //
-
- "variable_place_inference_pass", //
- "argument_type_display_pass", //
-
- "type_precision_cast_pass", //
- "variable_place_inference_pass", //
- "argument_type_display_pass", //
-
- "type_layout_cast_pass", // add layout/layout_once op if meet
- // different layout when last and next node
- "argument_type_display_pass", //
-
- "variable_place_inference_pass", //
- "argument_type_display_pass",
-
- "runtime_context_assign_pass",
- "argument_type_display_pass",
- "lite_reshape_fuse_pass",
-
- "memory_optimize_pass"}};
+ "identity_dropout_eliminate_pass",
+ "__xpu__resnet_fuse_pass",
+ "__xpu__resnet_d_fuse_pass",
+ "__xpu__resnet_cbam_fuse_pass",
+ "__xpu__conv2d_fuse_pass",
+ "__xpu__conv2d_link_previous_out_max_pass",
+ "__xpu__sfa_head_meanstd_fuse_pass",
+ "__xpu__sfa_head_moment_fuse_pass",
+ "__xpu__mmdnn_fuse_pass",
+ "__xpu__multi_encoder_fuse_pass",
+ "__xpu__embedding_with_eltwise_add_fuse_pass",
+ "__xpu__fc_fuse_pass",
+ "quantized_op_attributes_inference_pass", // Only for fully
+ // quantized model, infer
+ // the output scale and
+ // fix the attribute
+ // 'enable_int8' for all
+ // of the quantized ops.
+ "npu_subgraph_pass",
+ "huawei_ascend_npu_subgraph_pass",
+ "xpu_subgraph_pass",
+ "bm_subgraph_pass",
+ "apu_subgraph_pass",
+ "rknpu_subgraph_pass",
+ "mlu_subgraph_pass",
+ "control_flow_op_unused_inputs_and_outputs_eliminate_pass",
+ "static_kernel_pick_pass", // pick original kernel from graph
+
+ "remove_tf_redundant_ops_pass",
+ "variable_place_inference_pass", // inference arg/var's
+
+ "mlu_postprocess_pass",
+ // info(target/precision/layout/device)
+ // using kernel info
+ "argument_type_display_pass", // debug pass: show arg-type-node's
+ // info
+ // (target/precision/layout/device)
+
+ "type_target_cast_pass", // add io_copy/io_copy_once if meet
+ // different targets when last and next
+ // node
+ "variable_place_inference_pass", //
+ "argument_type_display_pass", //
+
+ "io_copy_kernel_pick_pass", //
+ "argument_type_display_pass", //
+
+ "variable_place_inference_pass", //
+ "argument_type_display_pass", //
+
+ "type_precision_cast_pass", //
+ "variable_place_inference_pass", //
+ "argument_type_display_pass", //
+
+ "type_layout_cast_pass", // add layout/layout_once op if meet
+ // different layout when last and next node
+ "argument_type_display_pass", //
+
+ "variable_place_inference_pass", //
+ "argument_type_display_pass",
+
+ "runtime_context_assign_pass",
+ "argument_type_display_pass",
+ "lite_reshape_fuse_pass",
+          "memory_optimize_pass"  // comment this line out when
+                                  // PRECISION_PROFILE is enabled
+ }};
if (passes.size() == 1) {
// multi_stream_analysis_pass must be in the front of
diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h
index fda2b74f8f37f4705382f768b353150fa0bda3d7..5ad541ad7c1464299bfde62d7340f4d80c20831d 100644
--- a/lite/core/profile/precision_profiler.h
+++ b/lite/core/profile/precision_profiler.h
@@ -18,10 +18,18 @@
* of each kernel.
*/
#pragma once
+
+#include
+#include
+
#include
+#include
+#include